# global options
knitr::opts_chunk$set(echo = TRUE)
# set working directory
setwd("../coinoculation-endophytes")
This markdown follows the DADA2 Pipeline Tutorial (1.16) (Benjamin Callahan) and sometimes code, comments, and text are entirely copied from the tutorial and pasted here. Modifications were made for these data and this study. Further down, I start a differential abundance analysis with DESeq2. The workflow I follow is also derived from this tutorial by Meeta Mistry, Mary Piper, Jihe Liu, and Radhika Khetani (2021).
Here I will import my forward and reverse reads from the Fluidigm_2020121 V3-V4 amplicon data to DADA2. Reads are already demultiplexed by the sequencing center.
Load packages
library(dada2); packageVersion("dada2") # for ASV sample inference
## Loading required package: Rcpp
## [1] '1.30.0'
library(cowplot); packageVersion("cowplot") # for plotting
## [1] '1.1.1'
library(phyloseq); packageVersion("phyloseq") # to create sequence objects
## [1] '1.46.0'
library(DECIPHER); packageVersion("DECIPHER") # for assigning taxonomy
## Loading required package: Biostrings
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.3.2
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
##
## distance
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.3.2
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: RSQLite
## Loading required package: parallel
## [1] '2.30.0'
library(Biostrings); packageVersion("Biostrings") # for manipulating sequences
## [1] '2.70.1'
library(tidyverse); packageVersion("tidyverse") # includes ggplot2, dplyr, readr, stringr
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine() masks BiocGenerics::combine()
## ✖ purrr::compact() masks XVector::compact()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks XVector::slice(), IRanges::slice()
## ✖ lubridate::stamp() masks cowplot::stamp()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [1] '2.0.0'
library(pheatmap); packageVersion("pheatmap") # to make a pretty heatmap
## [1] '1.0.12'
library(RColorBrewer); packageVersion("RColorBrewer") # for extra color palettes
## [1] '1.1.3'
library(viridis); packageVersion("viridis") # for extra color palettes
## Loading required package: viridisLite
## [1] '0.6.4'
library(DESeq2); packageVersion("DESeq2") # for differential abundance analysis
## Loading required package: GenomicRanges
## Loading required package: SummarizedExperiment
## Warning: package 'SummarizedExperiment' was built under R version 4.3.2
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
##
## The following object is masked from 'package:dplyr':
##
## count
##
##
## Attaching package: 'MatrixGenerics'
##
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
##
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'Biobase'
##
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
##
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## The following object is masked from 'package:phyloseq':
##
## sampleNames
## [1] '1.42.0'
library(edgeR); packageVersion("edgeR") # normalization tools
## Warning: package 'edgeR' was built under R version 4.3.2
## Loading required package: limma
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:DESeq2':
##
## plotMA
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
## [1] '4.0.2'
library(ggrepel); packageVersion("ggrepel") # for plots
## [1] '0.9.4'
Define path to reads
path <- "./16S/V3_F357_N_V4_R805" # Change this to the path of your local version of the 16S amplicon .fastq reads
list.files(path)
## [1] "filtered"
## [2] "unmatchedIndex.fastq"
## [3] "V3_F357_N_V4_R805-141_1_N_N_GTGTATGCGT_R1.fastq"
## [4] "V3_F357_N_V4_R805-141_1_N_N_GTGTATGCGT_R2.fastq"
## [5] "V3_F357_N_V4_R805-141_1_N_S_GTATCGTCGT_R1.fastq"
## [6] "V3_F357_N_V4_R805-141_1_N_S_GTATCGTCGT_R2.fastq"
## [7] "V3_F357_N_V4_R805-141_1_R_N_GTCGTCGTCT_R1.fastq"
## [8] "V3_F357_N_V4_R805-141_1_R_N_GTCGTCGTCT_R2.fastq"
## [9] "V3_F357_N_V4_R805-141_1_R_S_TGCTCGTAGT_R1.fastq"
## [10] "V3_F357_N_V4_R805-141_1_R_S_TGCTCGTAGT_R2.fastq"
## [11] "V3_F357_N_V4_R805-141_2_N_N_CGTGCTGTCA_R1.fastq"
## [12] "V3_F357_N_V4_R805-141_2_N_N_CGTGCTGTCA_R2.fastq"
## [13] "V3_F357_N_V4_R805-141_2_N_S_GAGCTAGTGA_R1.fastq"
## [14] "V3_F357_N_V4_R805-141_2_N_S_GAGCTAGTGA_R2.fastq"
## [15] "V3_F357_N_V4_R805-141_2_R_N_GTGCTGTCGT_R1.fastq"
## [16] "V3_F357_N_V4_R805-141_2_R_N_GTGCTGTCGT_R2.fastq"
## [17] "V3_F357_N_V4_R805-141_2_R_S_GATCGTCTCT_R1.fastq"
## [18] "V3_F357_N_V4_R805-141_2_R_S_GATCGTCTCT_R2.fastq"
## [19] "V3_F357_N_V4_R805-141_5_N_N_CGCTATCAGT_R1.fastq"
## [20] "V3_F357_N_V4_R805-141_5_N_N_CGCTATCAGT_R2.fastq"
## [21] "V3_F357_N_V4_R805-141_5_N_S_GAGTGATCGT_R1.fastq"
## [22] "V3_F357_N_V4_R805-141_5_N_S_GAGTGATCGT_R2.fastq"
## [23] "V3_F357_N_V4_R805-141_5_R_N_GCTAGTGAGT_R1.fastq"
## [24] "V3_F357_N_V4_R805-141_5_R_N_GCTAGTGAGT_R2.fastq"
## [25] "V3_F357_N_V4_R805-141_5_R_S_CGCTGTAGTC_R1.fastq"
## [26] "V3_F357_N_V4_R805-141_5_R_S_CGCTGTAGTC_R2.fastq"
## [27] "V3_F357_N_V4_R805-141_7_N_N_GCGTCGTGTA_R1.fastq"
## [28] "V3_F357_N_V4_R805-141_7_N_N_GCGTCGTGTA_R2.fastq"
## [29] "V3_F357_N_V4_R805-141_7_N_S_GTGCGTGTGT_R1.fastq"
## [30] "V3_F357_N_V4_R805-141_7_N_S_GTGCGTGTGT_R2.fastq"
## [31] "V3_F357_N_V4_R805-141_7_R_N_GATGTAGCGT_R1.fastq"
## [32] "V3_F357_N_V4_R805-141_7_R_N_GATGTAGCGT_R2.fastq"
## [33] "V3_F357_N_V4_R805-141_7_R_S_GTCGTGTACT_R1.fastq"
## [34] "V3_F357_N_V4_R805-141_7_R_S_GTCGTGTACT_R2.fastq"
## [35] "V3_F357_N_V4_R805-717_10_N_N_GTGAGAGACA_R1.fastq"
## [36] "V3_F357_N_V4_R805-717_10_N_N_GTGAGAGACA_R2.fastq"
## [37] "V3_F357_N_V4_R805-717_10_N_S_TACATCGCTG_R1.fastq"
## [38] "V3_F357_N_V4_R805-717_10_N_S_TACATCGCTG_R2.fastq"
## [39] "V3_F357_N_V4_R805-717_10_R_N_GCACGTAGCT_R1.fastq"
## [40] "V3_F357_N_V4_R805-717_10_R_N_GCACGTAGCT_R2.fastq"
## [41] "V3_F357_N_V4_R805-717_10_R_S_GACTGTACGT_R1.fastq"
## [42] "V3_F357_N_V4_R805-717_10_R_S_GACTGTACGT_R2.fastq"
## [43] "V3_F357_N_V4_R805-717_5_N_N_TAGTAGCGCG_R1.fastq"
## [44] "V3_F357_N_V4_R805-717_5_N_N_TAGTAGCGCG_R2.fastq"
## [45] "V3_F357_N_V4_R805-717_5_N_S_TACTGAGCTG_R1.fastq"
## [46] "V3_F357_N_V4_R805-717_5_N_S_TACTGAGCTG_R2.fastq"
## [47] "V3_F357_N_V4_R805-717_5_R_N_GTACTCGCGA_R1.fastq"
## [48] "V3_F357_N_V4_R805-717_5_R_N_GTACTCGCGA_R2.fastq"
## [49] "V3_F357_N_V4_R805-717_5_R_S_GACGTCTGCT_R1.fastq"
## [50] "V3_F357_N_V4_R805-717_5_R_S_GACGTCTGCT_R2.fastq"
## [51] "V3_F357_N_V4_R805-717_6_N_N_CGTACTACGT_R1.fastq"
## [52] "V3_F357_N_V4_R805-717_6_N_N_CGTACTACGT_R2.fastq"
## [53] "V3_F357_N_V4_R805-717_6_N_S_TCACGCTATG_R1.fastq"
## [54] "V3_F357_N_V4_R805-717_6_N_S_TCACGCTATG_R2.fastq"
## [55] "V3_F357_N_V4_R805-717_6_R_N_GAGATCAGTC_R1.fastq"
## [56] "V3_F357_N_V4_R805-717_6_R_N_GAGATCAGTC_R2.fastq"
## [57] "V3_F357_N_V4_R805-717_6_R_S_CAGCTGAGTA_R1.fastq"
## [58] "V3_F357_N_V4_R805-717_6_R_S_CAGCTGAGTA_R2.fastq"
## [59] "V3_F357_N_V4_R805-717_9_N_N_TAGACGTGCT_R1.fastq"
## [60] "V3_F357_N_V4_R805-717_9_N_N_TAGACGTGCT_R2.fastq"
## [61] "V3_F357_N_V4_R805-717_9_N_S_TCTGAGCGCA_R1.fastq"
## [62] "V3_F357_N_V4_R805-717_9_N_S_TCTGAGCGCA_R2.fastq"
## [63] "V3_F357_N_V4_R805-717_9_R_N_TCGAGTAGCG_R1.fastq"
## [64] "V3_F357_N_V4_R805-717_9_R_N_TCGAGTAGCG_R2.fastq"
## [65] "V3_F357_N_V4_R805-717_9_R_S_GTGACTCGTC_R1.fastq"
## [66] "V3_F357_N_V4_R805-717_9_R_S_GTGACTCGTC_R2.fastq"
## [67] "V3_F357_N_V4_R805-733_10_N_N_TAGTCTGTCA_R1.fastq"
## [68] "V3_F357_N_V4_R805-733_10_N_N_TAGTCTGTCA_R2.fastq"
## [69] "V3_F357_N_V4_R805-733_10_N_S_CGTATGATGT_R1.fastq"
## [70] "V3_F357_N_V4_R805-733_10_N_S_CGTATGATGT_R2.fastq"
## [71] "V3_F357_N_V4_R805-733_10_R_N_CTAGAGTATC_R1.fastq"
## [72] "V3_F357_N_V4_R805-733_10_R_N_CTAGAGTATC_R2.fastq"
## [73] "V3_F357_N_V4_R805-733_10_R_S_TGTCTCTATC_R1.fastq"
## [74] "V3_F357_N_V4_R805-733_10_R_S_TGTCTCTATC_R2.fastq"
## [75] "V3_F357_N_V4_R805-733_4_N_N_TATCGATGCT_R1.fastq"
## [76] "V3_F357_N_V4_R805-733_4_N_N_TATCGATGCT_R2.fastq"
## [77] "V3_F357_N_V4_R805-733_4_N_S_TGTGTCACTA_R1.fastq"
## [78] "V3_F357_N_V4_R805-733_4_N_S_TGTGTCACTA_R2.fastq"
## [79] "V3_F357_N_V4_R805-733_4_R_N_CATGCATCAT_R1.fastq"
## [80] "V3_F357_N_V4_R805-733_4_R_N_CATGCATCAT_R2.fastq"
## [81] "V3_F357_N_V4_R805-733_4_R_S_TAGAGTCTGT_R1.fastq"
## [82] "V3_F357_N_V4_R805-733_4_R_S_TAGAGTCTGT_R2.fastq"
## [83] "V3_F357_N_V4_R805-733_8_N_N_CGTCTATGAT_R1.fastq"
## [84] "V3_F357_N_V4_R805-733_8_N_N_CGTCTATGAT_R2.fastq"
## [85] "V3_F357_N_V4_R805-733_8_N_S_TGATCAGTCA_R1.fastq"
## [86] "V3_F357_N_V4_R805-733_8_N_S_TGATCAGTCA_R2.fastq"
## [87] "V3_F357_N_V4_R805-733_8_R_N_CTAGATCTGA_R1.fastq"
## [88] "V3_F357_N_V4_R805-733_8_R_N_CTAGATCTGA_R2.fastq"
## [89] "V3_F357_N_V4_R805-733_8_R_S_GTGATACTGA_R1.fastq"
## [90] "V3_F357_N_V4_R805-733_8_R_S_GTGATACTGA_R2.fastq"
## [91] "V3_F357_N_V4_R805-733_9_N_N_CATGAGTGTA_R1.fastq"
## [92] "V3_F357_N_V4_R805-733_9_N_N_CATGAGTGTA_R2.fastq"
## [93] "V3_F357_N_V4_R805-733_9_N_S_TATCATGTGC_R1.fastq"
## [94] "V3_F357_N_V4_R805-733_9_N_S_TATCATGTGC_R2.fastq"
## [95] "V3_F357_N_V4_R805-733_9_R_N_TATCTCATGC_R1.fastq"
## [96] "V3_F357_N_V4_R805-733_9_R_N_TATCTCATGC_R2.fastq"
## [97] "V3_F357_N_V4_R805-733_9_R_S_TGTCGTCATA_R1.fastq"
## [98] "V3_F357_N_V4_R805-733_9_R_S_TGTCGTCATA_R2.fastq"
## [99] "V3_F357_N_V4_R805-KIT_NEG_CTRL_CGAGTGCTGT_R1.fastq"
## [100] "V3_F357_N_V4_R805-KIT_NEG_CTRL_CGAGTGCTGT_R2.fastq"
## [101] "V3_F357_N_V4_R805-KIT_NEG_CTRL2_TCAGATGCTA_R1.fastq"
## [102] "V3_F357_N_V4_R805-KIT_NEG_CTRL2_TCAGATGCTA_R2.fastq"
## [103] "V3_F357_N_V4_R805-WATER_NEG_CTRL_GTATGAGCAC_R1.fastq"
## [104] "V3_F357_N_V4_R805-WATER_NEG_CTRL_GTATGAGCAC_R2.fastq"
## [105] "V3_F357_N_V4_R805-WATER_NEG_CTRL2_TATCAGTCTG_R1.fastq"
## [106] "V3_F357_N_V4_R805-WATER_NEG_CTRL2_TATCAGTCTG_R2.fastq"
## [107] "V3_F357_N_V4_R805-WaterFG_TATAGCACGC_R1.fastq"
## [108] "V3_F357_N_V4_R805-WaterFG_TATAGCACGC_R2.fastq"
## [109] "V3_F357_N_V4_R805-WaterFG2_TATGTACGTG_R1.fastq"
## [110] "V3_F357_N_V4_R805-WaterFG2_TATGTACGTG_R2.fastq"
Reads representing samples co-inoculated strains MAG522 and MAG702A are located within the folder “excluded_amplicon_samples” and are not included here, since the scope of the paper only pertains to 141, 717A, and 733B.
Now we read in the names of the fastq files, and perform some string manipulation to get matched lists of the forward and reverse fastq files.
# Forward and reverse fastq filenames have format: V3_F357_N_V4_R805-SAMPLENAME_BARCODE_R1.fastq and V3_F357_N_V4_R805-SAMPLENAME_BARCODE_R2.fastq
fnFs <- sort(list.files(path, pattern="_R1.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2.fastq", full.names = TRUE))
# Extract sample names, assuming filenames have the above format:
sample.names <- sapply(strsplit(basename(fnFs), "-"), `[`, 2)
sample.names <- sapply(strsplit(basename(sample.names), "_R1"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_T"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_A"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_G"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CA"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CG"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CC"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CTA"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CTT"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CTG"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CTC"), `[`, 1)
sample.names
## [1] "141_1_N_N" "141_1_N_S" "141_1_R_N" "141_1_R_S"
## [5] "141_2_N_N" "141_2_N_S" "141_2_R_N" "141_2_R_S"
## [9] "141_5_N_N" "141_5_N_S" "141_5_R_N" "141_5_R_S"
## [13] "141_7_N_N" "141_7_N_S" "141_7_R_N" "141_7_R_S"
## [17] "717_10_N_N" "717_10_N_S" "717_10_R_N" "717_10_R_S"
## [21] "717_5_N_N" "717_5_N_S" "717_5_R_N" "717_5_R_S"
## [25] "717_6_N_N" "717_6_N_S" "717_6_R_N" "717_6_R_S"
## [29] "717_9_N_N" "717_9_N_S" "717_9_R_N" "717_9_R_S"
## [33] "733_10_N_N" "733_10_N_S" "733_10_R_N" "733_10_R_S"
## [37] "733_4_N_N" "733_4_N_S" "733_4_R_N" "733_4_R_S"
## [41] "733_8_N_N" "733_8_N_S" "733_8_R_N" "733_8_R_S"
## [45] "733_9_N_N" "733_9_N_S" "733_9_R_N" "733_9_R_S"
## [49] "KIT_NEG_CTRL" "KIT_NEG_CTRL2" "WATER_NEG_CTRL" "WATER_NEG_CTRL2"
## [53] "WaterFG" "WaterFG2"
We start by visualizing the quality profiles of the forward reads:
forward_quality_plots <- plotQualityProfile(fnFs[1:86])
save_plot("./16S/forward_quality_plots.png", forward_quality_plots, base_height=12, base_width=24)
forward_quality_plots
In gray-scale is a heat map of the frequency of each quality score at each base position. The mean quality score at each position is shown by the green line, and the quartiles of the quality score distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least that position (this is more useful for other sequencing technologies, as Illumina reads are typically all the same length, hence the flat red line).
All forward reads seem to be of high quality (except the control samples). Even the end of the sequences all seem to be above or near 30, so I will not truncate the tails.
Now we visualize the quality profile of the reverse reads:
reverse_quality_plots <- plotQualityProfile(fnRs[1:86])
save_plot("./16S/reverse_quality_plot.png", reverse_quality_plots, base_height=12, base_width=24)
reverse_quality_plots
All reverse reads except controls also seem to be of high quality.
Assign the filenames for the filtered fastq.gz files.
# Place filtered files in filtered/ subdirectory
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
Sequence reads in file.path were already demultiplexed
by the sequencing center, so no barcodes should be present. There are,
however, two sets of primers on each read mate. The Fluidigm amplicon
structure results in reads with a CS (Fluidigm-specific) primer and the
V3-V4 primer. Both need to be removed from each end. According to the
sequencing results from the W. M. Keck Center:
fwd16Sprimer <- "CCTACGGGNGGCWGCAG"
rev16Sprimer <- "GACTACHVGGGTATCTAATCC"
CS1primer <- "AGACCAAGTCTCTGC"
CS2primer <- "TGTAGAACCATGTC"
fwdprimerlen <- nchar(fwd16Sprimer) + nchar(CS1primer)
revprimerlen <- nchar(rev16Sprimer) + nchar(CS2primer)
Filtering parameters: The standard filtering parameters (maxN=0 (DADA2 requires no N’s), truncQ=2, rm.phix=TRUE and maxEE=2) are starting points, not set in stone. If you want to speed up downstream computation, consider tightening maxEE. If too few reads are passing the filter, consider relaxing maxEE, perhaps especially on the reverse reads (eg. maxEE=c(2,5)), and reducing the truncLen to remove low quality tails. Remember though, when choosing truncLen for paired-end reads you must maintain overlap after truncation in order to merge them later.
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
trimLeft = c(fwdprimerlen, revprimerlen), # removes primers (FWD_PRIMER_LEN, REV_PRIMER_LEN)
truncLen=c(0,0), # Removes tails: first number is truncation site for forwards reads, second is for reverse reads
maxN=0, # After truncation, sequences with more than maxN Ns will be discarded (dada2 does not allow Ns)
maxEE=c(2,5), # Sets the maximum number of “expected errors” allowed in a read (forward, reverse)
truncQ=2, # Truncate reads at the first instance of a quality score less than or equal to truncQ
# minQ=30, # Only keep reads with a quality score > 30 (high-quality)
rm.phix=TRUE,
compress=TRUE, verbose=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
out
## reads.in reads.out
## V3_F357_N_V4_R805-141_1_N_N_GTGTATGCGT_R1.fastq 5473 4890
## V3_F357_N_V4_R805-141_1_N_S_GTATCGTCGT_R1.fastq 3995 3605
## V3_F357_N_V4_R805-141_1_R_N_GTCGTCGTCT_R1.fastq 3115 2767
## V3_F357_N_V4_R805-141_1_R_S_TGCTCGTAGT_R1.fastq 3535 3159
## V3_F357_N_V4_R805-141_2_N_N_CGTGCTGTCA_R1.fastq 7526 6747
## V3_F357_N_V4_R805-141_2_N_S_GAGCTAGTGA_R1.fastq 8069 7176
## V3_F357_N_V4_R805-141_2_R_N_GTGCTGTCGT_R1.fastq 7526 6677
## V3_F357_N_V4_R805-141_2_R_S_GATCGTCTCT_R1.fastq 3975 3473
## V3_F357_N_V4_R805-141_5_N_N_CGCTATCAGT_R1.fastq 6377 5680
## V3_F357_N_V4_R805-141_5_N_S_GAGTGATCGT_R1.fastq 4097 3651
## V3_F357_N_V4_R805-141_5_R_N_GCTAGTGAGT_R1.fastq 5271 4688
## V3_F357_N_V4_R805-141_5_R_S_CGCTGTAGTC_R1.fastq 5682 4987
## V3_F357_N_V4_R805-141_7_N_N_GCGTCGTGTA_R1.fastq 7335 6646
## V3_F357_N_V4_R805-141_7_N_S_GTGCGTGTGT_R1.fastq 3881 3522
## V3_F357_N_V4_R805-141_7_R_N_GATGTAGCGT_R1.fastq 4423 3943
## V3_F357_N_V4_R805-141_7_R_S_GTCGTGTACT_R1.fastq 4497 3954
## V3_F357_N_V4_R805-717_10_N_N_GTGAGAGACA_R1.fastq 10464 9461
## V3_F357_N_V4_R805-717_10_N_S_TACATCGCTG_R1.fastq 8360 7529
## V3_F357_N_V4_R805-717_10_R_N_GCACGTAGCT_R1.fastq 8229 7225
## V3_F357_N_V4_R805-717_10_R_S_GACTGTACGT_R1.fastq 6610 5949
## V3_F357_N_V4_R805-717_5_N_N_TAGTAGCGCG_R1.fastq 7726 7020
## V3_F357_N_V4_R805-717_5_N_S_TACTGAGCTG_R1.fastq 7510 6754
## V3_F357_N_V4_R805-717_5_R_N_GTACTCGCGA_R1.fastq 6888 6122
## V3_F357_N_V4_R805-717_5_R_S_GACGTCTGCT_R1.fastq 5998 5234
## V3_F357_N_V4_R805-717_6_N_N_CGTACTACGT_R1.fastq 7647 6792
## V3_F357_N_V4_R805-717_6_N_S_TCACGCTATG_R1.fastq 6983 6198
## V3_F357_N_V4_R805-717_6_R_N_GAGATCAGTC_R1.fastq 7250 6418
## V3_F357_N_V4_R805-717_6_R_S_CAGCTGAGTA_R1.fastq 5063 4429
## V3_F357_N_V4_R805-717_9_N_N_TAGACGTGCT_R1.fastq 5144 4665
## V3_F357_N_V4_R805-717_9_N_S_TCTGAGCGCA_R1.fastq 6497 5781
## V3_F357_N_V4_R805-717_9_R_N_TCGAGTAGCG_R1.fastq 4916 4360
## V3_F357_N_V4_R805-717_9_R_S_GTGACTCGTC_R1.fastq 4581 4059
## V3_F357_N_V4_R805-733_10_N_N_TAGTCTGTCA_R1.fastq 10293 9289
## V3_F357_N_V4_R805-733_10_N_S_CGTATGATGT_R1.fastq 10237 9129
## V3_F357_N_V4_R805-733_10_R_N_CTAGAGTATC_R1.fastq 5429 4812
## V3_F357_N_V4_R805-733_10_R_S_TGTCTCTATC_R1.fastq 6413 5743
## V3_F357_N_V4_R805-733_4_N_N_TATCGATGCT_R1.fastq 8426 7434
## V3_F357_N_V4_R805-733_4_N_S_TGTGTCACTA_R1.fastq 7478 6840
## V3_F357_N_V4_R805-733_4_R_N_CATGCATCAT_R1.fastq 7036 6173
## V3_F357_N_V4_R805-733_4_R_S_TAGAGTCTGT_R1.fastq 3665 3274
## V3_F357_N_V4_R805-733_8_N_N_CGTCTATGAT_R1.fastq 7841 6912
## V3_F357_N_V4_R805-733_8_N_S_TGATCAGTCA_R1.fastq 7764 7038
## V3_F357_N_V4_R805-733_8_R_N_CTAGATCTGA_R1.fastq 7260 6448
## V3_F357_N_V4_R805-733_8_R_S_GTGATACTGA_R1.fastq 6018 5235
## V3_F357_N_V4_R805-733_9_N_N_CATGAGTGTA_R1.fastq 1366 1214
## V3_F357_N_V4_R805-733_9_N_S_TATCATGTGC_R1.fastq 6092 5519
## V3_F357_N_V4_R805-733_9_R_N_TATCTCATGC_R1.fastq 3648 3270
## V3_F357_N_V4_R805-733_9_R_S_TGTCGTCATA_R1.fastq 4583 4132
## V3_F357_N_V4_R805-KIT_NEG_CTRL_CGAGTGCTGT_R1.fastq 24 12
## V3_F357_N_V4_R805-KIT_NEG_CTRL2_TCAGATGCTA_R1.fastq 35 20
## V3_F357_N_V4_R805-WATER_NEG_CTRL_GTATGAGCAC_R1.fastq 35 22
## V3_F357_N_V4_R805-WATER_NEG_CTRL2_TATCAGTCTG_R1.fastq 48 18
## V3_F357_N_V4_R805-WaterFG_TATAGCACGC_R1.fastq 27 16
## V3_F357_N_V4_R805-WaterFG2_TATGTACGTG_R1.fastq 28 16
The next thing we want to do is “dereplicate” the filtered fastq files. During dereplication, we condense the data by collapsing together all reads that encode the same sequence, which significantly reduces later computation times:
derepFs <- derepFastq(filtFs, verbose=TRUE)
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_N_N_F_filt.fastq.gz
## Encountered 1081 unique sequences from 4890 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_N_S_F_filt.fastq.gz
## Encountered 978 unique sequences from 3605 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_R_N_F_filt.fastq.gz
## Encountered 1098 unique sequences from 2767 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_R_S_F_filt.fastq.gz
## Encountered 821 unique sequences from 3159 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_N_N_F_filt.fastq.gz
## Encountered 1171 unique sequences from 6747 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_N_S_F_filt.fastq.gz
## Encountered 1374 unique sequences from 7176 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_R_N_F_filt.fastq.gz
## Encountered 1740 unique sequences from 6677 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_R_S_F_filt.fastq.gz
## Encountered 782 unique sequences from 3473 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_N_N_F_filt.fastq.gz
## Encountered 1029 unique sequences from 5680 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_N_S_F_filt.fastq.gz
## Encountered 897 unique sequences from 3651 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_R_N_F_filt.fastq.gz
## Encountered 1691 unique sequences from 4688 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_R_S_F_filt.fastq.gz
## Encountered 1071 unique sequences from 4987 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_N_N_F_filt.fastq.gz
## Encountered 1293 unique sequences from 6646 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_N_S_F_filt.fastq.gz
## Encountered 863 unique sequences from 3522 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_R_N_F_filt.fastq.gz
## Encountered 1499 unique sequences from 3943 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_R_S_F_filt.fastq.gz
## Encountered 832 unique sequences from 3954 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_N_N_F_filt.fastq.gz
## Encountered 1658 unique sequences from 9461 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_N_S_F_filt.fastq.gz
## Encountered 1300 unique sequences from 7529 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_R_N_F_filt.fastq.gz
## Encountered 3068 unique sequences from 7225 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_R_S_F_filt.fastq.gz
## Encountered 985 unique sequences from 5949 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_N_N_F_filt.fastq.gz
## Encountered 1381 unique sequences from 7020 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_N_S_F_filt.fastq.gz
## Encountered 1281 unique sequences from 6754 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_R_N_F_filt.fastq.gz
## Encountered 2109 unique sequences from 6122 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_R_S_F_filt.fastq.gz
## Encountered 1064 unique sequences from 5234 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_N_N_F_filt.fastq.gz
## Encountered 1273 unique sequences from 6792 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_N_S_F_filt.fastq.gz
## Encountered 1188 unique sequences from 6198 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_R_N_F_filt.fastq.gz
## Encountered 2419 unique sequences from 6418 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_R_S_F_filt.fastq.gz
## Encountered 907 unique sequences from 4429 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_N_N_F_filt.fastq.gz
## Encountered 1099 unique sequences from 4665 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_N_S_F_filt.fastq.gz
## Encountered 1221 unique sequences from 5781 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_R_N_F_filt.fastq.gz
## Encountered 1508 unique sequences from 4360 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_R_S_F_filt.fastq.gz
## Encountered 884 unique sequences from 4059 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_N_N_F_filt.fastq.gz
## Encountered 2059 unique sequences from 9289 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_N_S_F_filt.fastq.gz
## Encountered 1622 unique sequences from 9129 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_R_N_F_filt.fastq.gz
## Encountered 2041 unique sequences from 4812 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_R_S_F_filt.fastq.gz
## Encountered 1207 unique sequences from 5743 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_N_N_F_filt.fastq.gz
## Encountered 2192 unique sequences from 7434 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_N_S_F_filt.fastq.gz
## Encountered 1247 unique sequences from 6840 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_R_N_F_filt.fastq.gz
## Encountered 1535 unique sequences from 6173 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_R_S_F_filt.fastq.gz
## Encountered 827 unique sequences from 3274 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_N_N_F_filt.fastq.gz
## Encountered 1231 unique sequences from 6912 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_N_S_F_filt.fastq.gz
## Encountered 1340 unique sequences from 7038 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_R_N_F_filt.fastq.gz
## Encountered 1646 unique sequences from 6448 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_R_S_F_filt.fastq.gz
## Encountered 1432 unique sequences from 5235 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_N_N_F_filt.fastq.gz
## Encountered 413 unique sequences from 1214 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_N_S_F_filt.fastq.gz
## Encountered 1114 unique sequences from 5519 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_R_N_F_filt.fastq.gz
## Encountered 765 unique sequences from 3270 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_R_S_F_filt.fastq.gz
## Encountered 1007 unique sequences from 4132 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/KIT_NEG_CTRL_F_filt.fastq.gz
## Encountered 8 unique sequences from 12 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/KIT_NEG_CTRL2_F_filt.fastq.gz
## Encountered 12 unique sequences from 20 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WATER_NEG_CTRL_F_filt.fastq.gz
## Encountered 14 unique sequences from 22 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WATER_NEG_CTRL2_F_filt.fastq.gz
## Encountered 11 unique sequences from 18 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WaterFG_F_filt.fastq.gz
## Encountered 12 unique sequences from 16 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WaterFG2_F_filt.fastq.gz
## Encountered 9 unique sequences from 16 total sequences read.
derepRs <- derepFastq(filtRs, verbose=TRUE)
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_N_N_R_filt.fastq.gz
## Encountered 1098 unique sequences from 4890 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_N_S_R_filt.fastq.gz
## Encountered 942 unique sequences from 3605 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_R_N_R_filt.fastq.gz
## Encountered 1025 unique sequences from 2767 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_R_S_R_filt.fastq.gz
## Encountered 805 unique sequences from 3159 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_N_N_R_filt.fastq.gz
## Encountered 1283 unique sequences from 6747 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_N_S_R_filt.fastq.gz
## Encountered 1436 unique sequences from 7176 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_R_N_R_filt.fastq.gz
## Encountered 1654 unique sequences from 6677 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_R_S_R_filt.fastq.gz
## Encountered 788 unique sequences from 3473 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_N_N_R_filt.fastq.gz
## Encountered 1078 unique sequences from 5680 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_N_S_R_filt.fastq.gz
## Encountered 930 unique sequences from 3651 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_R_N_R_filt.fastq.gz
## Encountered 1556 unique sequences from 4688 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_R_S_R_filt.fastq.gz
## Encountered 1099 unique sequences from 4987 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_N_N_R_filt.fastq.gz
## Encountered 1305 unique sequences from 6646 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_N_S_R_filt.fastq.gz
## Encountered 819 unique sequences from 3522 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_R_N_R_filt.fastq.gz
## Encountered 1452 unique sequences from 3943 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_R_S_R_filt.fastq.gz
## Encountered 846 unique sequences from 3954 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_N_N_R_filt.fastq.gz
## Encountered 1719 unique sequences from 9461 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_N_S_R_filt.fastq.gz
## Encountered 1404 unique sequences from 7529 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_R_N_R_filt.fastq.gz
## Encountered 2973 unique sequences from 7225 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_R_S_R_filt.fastq.gz
## Encountered 1029 unique sequences from 5949 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_N_N_R_filt.fastq.gz
## Encountered 1465 unique sequences from 7020 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_N_S_R_filt.fastq.gz
## Encountered 1378 unique sequences from 6754 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_R_N_R_filt.fastq.gz
## Encountered 2014 unique sequences from 6122 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_R_S_R_filt.fastq.gz
## Encountered 1018 unique sequences from 5234 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_N_N_R_filt.fastq.gz
## Encountered 1381 unique sequences from 6792 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_N_S_R_filt.fastq.gz
## Encountered 1248 unique sequences from 6198 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_R_N_R_filt.fastq.gz
## Encountered 2277 unique sequences from 6418 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_R_S_R_filt.fastq.gz
## Encountered 985 unique sequences from 4429 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_N_N_R_filt.fastq.gz
## Encountered 1101 unique sequences from 4665 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_N_S_R_filt.fastq.gz
## Encountered 1243 unique sequences from 5781 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_R_N_R_filt.fastq.gz
## Encountered 1458 unique sequences from 4360 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_R_S_R_filt.fastq.gz
## Encountered 967 unique sequences from 4059 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_N_N_R_filt.fastq.gz
## Encountered 2014 unique sequences from 9289 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_N_S_R_filt.fastq.gz
## Encountered 1700 unique sequences from 9129 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_R_N_R_filt.fastq.gz
## Encountered 1887 unique sequences from 4812 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_R_S_R_filt.fastq.gz
## Encountered 1189 unique sequences from 5743 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_N_N_R_filt.fastq.gz
## Encountered 1741 unique sequences from 7434 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_N_S_R_filt.fastq.gz
## Encountered 1301 unique sequences from 6840 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_R_N_R_filt.fastq.gz
## Encountered 1524 unique sequences from 6173 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_R_S_R_filt.fastq.gz
## Encountered 870 unique sequences from 3274 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_N_N_R_filt.fastq.gz
## Encountered 1345 unique sequences from 6912 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_N_S_R_filt.fastq.gz
## Encountered 1375 unique sequences from 7038 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_R_N_R_filt.fastq.gz
## Encountered 1701 unique sequences from 6448 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_R_S_R_filt.fastq.gz
## Encountered 1599 unique sequences from 5235 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_N_N_R_filt.fastq.gz
## Encountered 413 unique sequences from 1214 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_N_S_R_filt.fastq.gz
## Encountered 1131 unique sequences from 5519 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_R_N_R_filt.fastq.gz
## Encountered 734 unique sequences from 3270 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_R_S_R_filt.fastq.gz
## Encountered 1049 unique sequences from 4132 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/KIT_NEG_CTRL_R_filt.fastq.gz
## Encountered 10 unique sequences from 12 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/KIT_NEG_CTRL2_R_filt.fastq.gz
## Encountered 13 unique sequences from 20 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WATER_NEG_CTRL_R_filt.fastq.gz
## Encountered 15 unique sequences from 22 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WATER_NEG_CTRL2_R_filt.fastq.gz
## Encountered 11 unique sequences from 18 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WaterFG_R_filt.fastq.gz
## Encountered 11 unique sequences from 16 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WaterFG2_R_filt.fastq.gz
## Encountered 10 unique sequences from 16 total sequences read.
Dereplication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.
The DADA2 algorithm makes use of a parametric error model (err) and every amplicon dataset has a different set of error rates. The learnErrors method learns this error model from the data, by alternating estimation of the error rates and inference of sample composition until they converge on a jointly consistent solution. As in many machine-learning problems, the algorithm must begin with an initial guess, for which the maximum possible error rates in this data are used (the error rates if only the most abundant sequence is correct and all the rest are errors).
errF <- learnErrors(derepFs, multithread=TRUE)
## 58015566 total bases in 266127 reads from 54 samples will be used for learning the error rates.
errR <- learnErrors(derepRs, multithread=TRUE)
## 57217221 total bases in 266127 reads from 54 samples will be used for learning the error rates.
Visualize estimated error rates:
plotErrors(errF, nominalQ=TRUE)
## Warning: Transformation introduced infinite values in continuous y-axis
plotErrors(errR, nominalQ=TRUE)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected. Everything looks reasonable and we proceed with confidence.
We are now ready to apply the core sample inference algorithm to the filtered and trimmed sequence data.
dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
## Sample 1 - 4890 reads in 1081 unique sequences.
## Sample 2 - 3605 reads in 978 unique sequences.
## Sample 3 - 2767 reads in 1098 unique sequences.
## Sample 4 - 3159 reads in 821 unique sequences.
## Sample 5 - 6747 reads in 1171 unique sequences.
## Sample 6 - 7176 reads in 1374 unique sequences.
## Sample 7 - 6677 reads in 1740 unique sequences.
## Sample 8 - 3473 reads in 782 unique sequences.
## Sample 9 - 5680 reads in 1029 unique sequences.
## Sample 10 - 3651 reads in 897 unique sequences.
## Sample 11 - 4688 reads in 1691 unique sequences.
## Sample 12 - 4987 reads in 1071 unique sequences.
## Sample 13 - 6646 reads in 1293 unique sequences.
## Sample 14 - 3522 reads in 863 unique sequences.
## Sample 15 - 3943 reads in 1499 unique sequences.
## Sample 16 - 3954 reads in 832 unique sequences.
## Sample 17 - 9461 reads in 1658 unique sequences.
## Sample 18 - 7529 reads in 1300 unique sequences.
## Sample 19 - 7225 reads in 3068 unique sequences.
## Sample 20 - 5949 reads in 985 unique sequences.
## Sample 21 - 7020 reads in 1381 unique sequences.
## Sample 22 - 6754 reads in 1281 unique sequences.
## Sample 23 - 6122 reads in 2109 unique sequences.
## Sample 24 - 5234 reads in 1064 unique sequences.
## Sample 25 - 6792 reads in 1273 unique sequences.
## Sample 26 - 6198 reads in 1188 unique sequences.
## Sample 27 - 6418 reads in 2419 unique sequences.
## Sample 28 - 4429 reads in 907 unique sequences.
## Sample 29 - 4665 reads in 1099 unique sequences.
## Sample 30 - 5781 reads in 1221 unique sequences.
## Sample 31 - 4360 reads in 1508 unique sequences.
## Sample 32 - 4059 reads in 884 unique sequences.
## Sample 33 - 9289 reads in 2059 unique sequences.
## Sample 34 - 9129 reads in 1622 unique sequences.
## Sample 35 - 4812 reads in 2041 unique sequences.
## Sample 36 - 5743 reads in 1207 unique sequences.
## Sample 37 - 7434 reads in 2192 unique sequences.
## Sample 38 - 6840 reads in 1247 unique sequences.
## Sample 39 - 6173 reads in 1535 unique sequences.
## Sample 40 - 3274 reads in 827 unique sequences.
## Sample 41 - 6912 reads in 1231 unique sequences.
## Sample 42 - 7038 reads in 1340 unique sequences.
## Sample 43 - 6448 reads in 1646 unique sequences.
## Sample 44 - 5235 reads in 1432 unique sequences.
## Sample 45 - 1214 reads in 413 unique sequences.
## Sample 46 - 5519 reads in 1114 unique sequences.
## Sample 47 - 3270 reads in 765 unique sequences.
## Sample 48 - 4132 reads in 1007 unique sequences.
## Sample 49 - 12 reads in 8 unique sequences.
## Sample 50 - 20 reads in 12 unique sequences.
## Sample 51 - 22 reads in 14 unique sequences.
## Sample 52 - 18 reads in 11 unique sequences.
## Sample 53 - 16 reads in 12 unique sequences.
## Sample 54 - 16 reads in 9 unique sequences.
dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
## Sample 1 - 4890 reads in 1098 unique sequences.
## Sample 2 - 3605 reads in 942 unique sequences.
## Sample 3 - 2767 reads in 1025 unique sequences.
## Sample 4 - 3159 reads in 805 unique sequences.
## Sample 5 - 6747 reads in 1283 unique sequences.
## Sample 6 - 7176 reads in 1436 unique sequences.
## Sample 7 - 6677 reads in 1654 unique sequences.
## Sample 8 - 3473 reads in 788 unique sequences.
## Sample 9 - 5680 reads in 1078 unique sequences.
## Sample 10 - 3651 reads in 930 unique sequences.
## Sample 11 - 4688 reads in 1556 unique sequences.
## Sample 12 - 4987 reads in 1099 unique sequences.
## Sample 13 - 6646 reads in 1305 unique sequences.
## Sample 14 - 3522 reads in 819 unique sequences.
## Sample 15 - 3943 reads in 1452 unique sequences.
## Sample 16 - 3954 reads in 846 unique sequences.
## Sample 17 - 9461 reads in 1719 unique sequences.
## Sample 18 - 7529 reads in 1404 unique sequences.
## Sample 19 - 7225 reads in 2973 unique sequences.
## Sample 20 - 5949 reads in 1029 unique sequences.
## Sample 21 - 7020 reads in 1465 unique sequences.
## Sample 22 - 6754 reads in 1378 unique sequences.
## Sample 23 - 6122 reads in 2014 unique sequences.
## Sample 24 - 5234 reads in 1018 unique sequences.
## Sample 25 - 6792 reads in 1381 unique sequences.
## Sample 26 - 6198 reads in 1248 unique sequences.
## Sample 27 - 6418 reads in 2277 unique sequences.
## Sample 28 - 4429 reads in 985 unique sequences.
## Sample 29 - 4665 reads in 1101 unique sequences.
## Sample 30 - 5781 reads in 1243 unique sequences.
## Sample 31 - 4360 reads in 1458 unique sequences.
## Sample 32 - 4059 reads in 967 unique sequences.
## Sample 33 - 9289 reads in 2014 unique sequences.
## Sample 34 - 9129 reads in 1700 unique sequences.
## Sample 35 - 4812 reads in 1887 unique sequences.
## Sample 36 - 5743 reads in 1189 unique sequences.
## Sample 37 - 7434 reads in 1741 unique sequences.
## Sample 38 - 6840 reads in 1301 unique sequences.
## Sample 39 - 6173 reads in 1524 unique sequences.
## Sample 40 - 3274 reads in 870 unique sequences.
## Sample 41 - 6912 reads in 1345 unique sequences.
## Sample 42 - 7038 reads in 1375 unique sequences.
## Sample 43 - 6448 reads in 1701 unique sequences.
## Sample 44 - 5235 reads in 1599 unique sequences.
## Sample 45 - 1214 reads in 413 unique sequences.
## Sample 46 - 5519 reads in 1131 unique sequences.
## Sample 47 - 3270 reads in 734 unique sequences.
## Sample 48 - 4132 reads in 1049 unique sequences.
## Sample 49 - 12 reads in 10 unique sequences.
## Sample 50 - 20 reads in 13 unique sequences.
## Sample 51 - 22 reads in 15 unique sequences.
## Sample 52 - 18 reads in 11 unique sequences.
## Sample 53 - 16 reads in 11 unique sequences.
## Sample 54 - 16 reads in 10 unique sequences.
saveRDS(dadaFs, file = "./16S/dadaFs.RDS")
saveRDS(dadaRs, file = "./16S/dadaRs.RDS")
Inspecting the returned dada-class objects:
dadaFs[[1]]
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1081 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
dadaRs[[1]]
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1098 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
readRDS(file = "./16S/dadaFs.RDS")
## $`141_1_N_N`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1081 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_1_N_S`
## dada-class: object describing DADA2 denoising results
## 10 sequence variants were inferred from 978 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_1_R_N`
## dada-class: object describing DADA2 denoising results
## 30 sequence variants were inferred from 1098 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_1_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 821 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_2_N_N`
## dada-class: object describing DADA2 denoising results
## 15 sequence variants were inferred from 1171 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_2_N_S`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1374 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_2_R_N`
## dada-class: object describing DADA2 denoising results
## 34 sequence variants were inferred from 1740 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_2_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 782 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_5_N_N`
## dada-class: object describing DADA2 denoising results
## 15 sequence variants were inferred from 1029 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_5_N_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 897 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_5_R_N`
## dada-class: object describing DADA2 denoising results
## 52 sequence variants were inferred from 1691 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_5_R_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 1071 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_7_N_N`
## dada-class: object describing DADA2 denoising results
## 14 sequence variants were inferred from 1293 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_7_N_S`
## dada-class: object describing DADA2 denoising results
## 12 sequence variants were inferred from 863 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_7_R_N`
## dada-class: object describing DADA2 denoising results
## 44 sequence variants were inferred from 1499 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_7_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 832 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_10_N_N`
## dada-class: object describing DADA2 denoising results
## 23 sequence variants were inferred from 1658 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_10_N_S`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1300 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_10_R_N`
## dada-class: object describing DADA2 denoising results
## 88 sequence variants were inferred from 3068 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_10_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 985 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_5_N_N`
## dada-class: object describing DADA2 denoising results
## 19 sequence variants were inferred from 1381 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_5_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1281 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_5_R_N`
## dada-class: object describing DADA2 denoising results
## 50 sequence variants were inferred from 2109 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_5_R_S`
## dada-class: object describing DADA2 denoising results
## 10 sequence variants were inferred from 1064 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_6_N_N`
## dada-class: object describing DADA2 denoising results
## 20 sequence variants were inferred from 1273 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_6_N_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 1188 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_6_R_N`
## dada-class: object describing DADA2 denoising results
## 62 sequence variants were inferred from 2419 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_6_R_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 907 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_9_N_N`
## dada-class: object describing DADA2 denoising results
## 26 sequence variants were inferred from 1099 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_9_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1221 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_9_R_N`
## dada-class: object describing DADA2 denoising results
## 39 sequence variants were inferred from 1508 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_9_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 884 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_10_N_N`
## dada-class: object describing DADA2 denoising results
## 39 sequence variants were inferred from 2059 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_10_N_S`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1622 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_10_R_N`
## dada-class: object describing DADA2 denoising results
## 59 sequence variants were inferred from 2041 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_10_R_S`
## dada-class: object describing DADA2 denoising results
## 16 sequence variants were inferred from 1207 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_4_N_N`
## dada-class: object describing DADA2 denoising results
## 34 sequence variants were inferred from 2192 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_4_N_S`
## dada-class: object describing DADA2 denoising results
## 9 sequence variants were inferred from 1247 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_4_R_N`
## dada-class: object describing DADA2 denoising results
## 28 sequence variants were inferred from 1535 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_4_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 827 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_8_N_N`
## dada-class: object describing DADA2 denoising results
## 9 sequence variants were inferred from 1231 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_8_N_S`
## dada-class: object describing DADA2 denoising results
## 10 sequence variants were inferred from 1340 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_8_R_N`
## dada-class: object describing DADA2 denoising results
## 23 sequence variants were inferred from 1646 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_8_R_S`
## dada-class: object describing DADA2 denoising results
## 31 sequence variants were inferred from 1432 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_9_N_N`
## dada-class: object describing DADA2 denoising results
## 23 sequence variants were inferred from 413 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_9_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1114 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_9_R_N`
## dada-class: object describing DADA2 denoising results
## 16 sequence variants were inferred from 765 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_9_R_S`
## dada-class: object describing DADA2 denoising results
## 14 sequence variants were inferred from 1007 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $KIT_NEG_CTRL
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 8 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $KIT_NEG_CTRL2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 12 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $WATER_NEG_CTRL
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 14 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $WATER_NEG_CTRL2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 11 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $WaterFG
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 12 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $WaterFG2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 9 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
readRDS(file = "./16S/dadaRs.RDS")
## $`141_1_N_N`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1098 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_1_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 942 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_1_R_N`
## dada-class: object describing DADA2 denoising results
## 27 sequence variants were inferred from 1025 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_1_R_S`
## dada-class: object describing DADA2 denoising results
## 4 sequence variants were inferred from 805 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_2_N_N`
## dada-class: object describing DADA2 denoising results
## 12 sequence variants were inferred from 1283 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_2_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1436 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_2_R_N`
## dada-class: object describing DADA2 denoising results
## 38 sequence variants were inferred from 1654 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_2_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 788 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_5_N_N`
## dada-class: object describing DADA2 denoising results
## 13 sequence variants were inferred from 1078 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_5_N_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 930 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_5_R_N`
## dada-class: object describing DADA2 denoising results
## 46 sequence variants were inferred from 1556 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_5_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1099 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_7_N_N`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1305 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_7_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 819 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_7_R_N`
## dada-class: object describing DADA2 denoising results
## 49 sequence variants were inferred from 1452 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`141_7_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 846 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_10_N_N`
## dada-class: object describing DADA2 denoising results
## 20 sequence variants were inferred from 1719 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_10_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1404 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_10_R_N`
## dada-class: object describing DADA2 denoising results
## 89 sequence variants were inferred from 2973 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_10_R_S`
## dada-class: object describing DADA2 denoising results
## 4 sequence variants were inferred from 1029 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_5_N_N`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1465 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_5_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1378 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_5_R_N`
## dada-class: object describing DADA2 denoising results
## 48 sequence variants were inferred from 2014 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_5_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1018 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_6_N_N`
## dada-class: object describing DADA2 denoising results
## 16 sequence variants were inferred from 1381 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_6_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1248 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_6_R_N`
## dada-class: object describing DADA2 denoising results
## 66 sequence variants were inferred from 2277 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_6_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 985 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_9_N_N`
## dada-class: object describing DADA2 denoising results
## 20 sequence variants were inferred from 1101 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_9_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1243 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_9_R_N`
## dada-class: object describing DADA2 denoising results
## 42 sequence variants were inferred from 1458 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`717_9_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 967 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_10_N_N`
## dada-class: object describing DADA2 denoising results
## 35 sequence variants were inferred from 2014 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_10_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1700 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_10_R_N`
## dada-class: object describing DADA2 denoising results
## 57 sequence variants were inferred from 1887 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_10_R_S`
## dada-class: object describing DADA2 denoising results
## 14 sequence variants were inferred from 1189 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_4_N_N`
## dada-class: object describing DADA2 denoising results
## 32 sequence variants were inferred from 1741 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_4_N_S`
## dada-class: object describing DADA2 denoising results
## 9 sequence variants were inferred from 1301 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_4_R_N`
## dada-class: object describing DADA2 denoising results
## 25 sequence variants were inferred from 1524 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_4_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 870 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_8_N_N`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 1345 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_8_N_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 1375 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_8_R_N`
## dada-class: object describing DADA2 denoising results
## 20 sequence variants were inferred from 1701 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_8_R_S`
## dada-class: object describing DADA2 denoising results
## 29 sequence variants were inferred from 1599 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_9_N_N`
## dada-class: object describing DADA2 denoising results
## 22 sequence variants were inferred from 413 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_9_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1131 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_9_R_N`
## dada-class: object describing DADA2 denoising results
## 13 sequence variants were inferred from 734 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $`733_9_R_S`
## dada-class: object describing DADA2 denoising results
## 14 sequence variants were inferred from 1049 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $KIT_NEG_CTRL
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 10 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $KIT_NEG_CTRL2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 13 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $WATER_NEG_CTRL
## dada-class: object describing DADA2 denoising results
## 3 sequence variants were inferred from 15 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $WATER_NEG_CTRL2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 11 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $WaterFG
## dada-class: object describing DADA2 denoising results
## 2 sequence variants were inferred from 11 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
##
## $WaterFG2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 10 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
We now merge the forward and reverse reads together to obtain the full denoised sequences. Merging is performed by aligning the denoised forward reads with the reverse-complement of the corresponding denoised reverse reads, and then constructing the merged “contig” sequences. By default, merged sequences are only output if the forward and reverse reads overlap by at least 12 bases, and are identical to each other in the overlap region (but these conditions can be changed via function arguments).
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
## 4781 paired-reads (in 12 unique pairings) successfully merged out of 4828 (in 23 pairings) input.
## 3520 paired-reads (in 13 unique pairings) successfully merged out of 3556 (in 18 pairings) input.
## 2408 paired-reads (in 29 unique pairings) successfully merged out of 2588 (in 130 pairings) input.
## 3128 paired-reads (in 5 unique pairings) successfully merged out of 3143 (in 7 pairings) input.
## 6641 paired-reads (in 19 unique pairings) successfully merged out of 6691 (in 32 pairings) input.
## 7104 paired-reads (in 12 unique pairings) successfully merged out of 7131 (in 18 pairings) input.
## 6338 paired-reads (in 41 unique pairings) successfully merged out of 6483 (in 121 pairings) input.
## 3436 paired-reads (in 5 unique pairings) successfully merged out of 3449 (in 9 pairings) input.
## 5584 paired-reads (in 21 unique pairings) successfully merged out of 5634 (in 34 pairings) input.
## 3588 paired-reads (in 8 unique pairings) successfully merged out of 3614 (in 14 pairings) input.
## 4073 paired-reads (in 50 unique pairings) successfully merged out of 4394 (in 228 pairings) input.
## 4940 paired-reads (in 7 unique pairings) successfully merged out of 4958 (in 11 pairings) input.
## 6505 paired-reads (in 18 unique pairings) successfully merged out of 6567 (in 26 pairings) input.
## 3461 paired-reads (in 15 unique pairings) successfully merged out of 3502 (in 26 pairings) input.
## 3512 paired-reads (in 54 unique pairings) successfully merged out of 3728 (in 187 pairings) input.
## 3905 paired-reads (in 4 unique pairings) successfully merged out of 3927 (in 8 pairings) input.
## 9280 paired-reads (in 30 unique pairings) successfully merged out of 9355 (in 61 pairings) input.
## 7454 paired-reads (in 11 unique pairings) successfully merged out of 7480 (in 19 pairings) input.
## 6282 paired-reads (in 98 unique pairings) successfully merged out of 6904 (in 411 pairings) input.
## 5924 paired-reads (in 6 unique pairings) successfully merged out of 5935 (in 11 pairings) input.
## 6875 paired-reads (in 20 unique pairings) successfully merged out of 6938 (in 39 pairings) input.
## 6684 paired-reads (in 9 unique pairings) successfully merged out of 6705 (in 16 pairings) input.
## 5440 paired-reads (in 69 unique pairings) successfully merged out of 5849 (in 247 pairings) input.
## 5181 paired-reads (in 6 unique pairings) successfully merged out of 5193 (in 10 pairings) input.
## 6622 paired-reads (in 22 unique pairings) successfully merged out of 6708 (in 45 pairings) input.
## 6121 paired-reads (in 7 unique pairings) successfully merged out of 6152 (in 14 pairings) input.
## 5606 paired-reads (in 75 unique pairings) successfully merged out of 6134 (in 314 pairings) input.
## 4393 paired-reads (in 6 unique pairings) successfully merged out of 4405 (in 10 pairings) input.
## 4479 paired-reads (in 27 unique pairings) successfully merged out of 4570 (in 71 pairings) input.
## 5703 paired-reads (in 7 unique pairings) successfully merged out of 5730 (in 13 pairings) input.
## 3981 paired-reads (in 45 unique pairings) successfully merged out of 4199 (in 171 pairings) input.
## 4018 paired-reads (in 5 unique pairings) successfully merged out of 4031 (in 10 pairings) input.
## 8966 paired-reads (in 63 unique pairings) successfully merged out of 9165 (in 136 pairings) input.
## 9049 paired-reads (in 12 unique pairings) successfully merged out of 9079 (in 20 pairings) input.
## 3958 paired-reads (in 100 unique pairings) successfully merged out of 4563 (in 373 pairings) input.
## 5653 paired-reads (in 14 unique pairings) successfully merged out of 5681 (in 21 pairings) input.
## 7059 paired-reads (in 57 unique pairings) successfully merged out of 7220 (in 131 pairings) input.
## 6774 paired-reads (in 10 unique pairings) successfully merged out of 6797 (in 16 pairings) input.
## 5922 paired-reads (in 45 unique pairings) successfully merged out of 6026 (in 90 pairings) input.
## 3246 paired-reads (in 5 unique pairings) successfully merged out of 3251 (in 8 pairings) input.
## 6855 paired-reads (in 10 unique pairings) successfully merged out of 6875 (in 17 pairings) input.
## 6963 paired-reads (in 11 unique pairings) successfully merged out of 6989 (in 22 pairings) input.
## 6171 paired-reads (in 36 unique pairings) successfully merged out of 6307 (in 79 pairings) input.
## 5034 paired-reads (in 37 unique pairings) successfully merged out of 5100 (in 77 pairings) input.
## 1052 paired-reads (in 24 unique pairings) successfully merged out of 1117 (in 63 pairings) input.
## 5456 paired-reads (in 5 unique pairings) successfully merged out of 5475 (in 11 pairings) input.
## 3100 paired-reads (in 27 unique pairings) successfully merged out of 3193 (in 55 pairings) input.
## 4083 paired-reads (in 14 unique pairings) successfully merged out of 4097 (in 21 pairings) input.
## 9 paired-reads (in 1 unique pairings) successfully merged out of 9 (in 1 pairings) input.
## 15 paired-reads (in 1 unique pairings) successfully merged out of 15 (in 1 pairings) input.
## 15 paired-reads (in 1 unique pairings) successfully merged out of 15 (in 1 pairings) input.
## 17 paired-reads (in 1 unique pairings) successfully merged out of 17 (in 1 pairings) input.
## 8 paired-reads (in 1 unique pairings) successfully merged out of 8 (in 1 pairings) input.
## 13 paired-reads (in 1 unique pairings) successfully merged out of 13 (in 1 pairings) input.
# Inspect the merger data.frame from the first sample
head(mergers[[1]])
## sequence
## 1 AATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTCACCGGTGAAGATAATGACGGTAACCGGAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGTTAAGTGAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTCATACTGGCAATCTAGAGTCCAGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCTGGAACTGACGCTGAGGTGCGAAAGC
## 2 AATGGGCGAAAGCCTGACGGAGCAATGCCGCGTGGAGGTAAAAGGCCTACGGGTCATGAACTTCTTTTCCTGGAGAAGAAACAATGACGGTATCCGGGGAATAAGCATCGGCTAACTCTGTGCCAGCAGCCGCGGTAAGACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTTAAGTTCGCCGTCAAATCCCAGGGCTCAACCCTGGACAGGTGGTGAAAACTACTAAGCTAGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTGAAATGCGTAGAGATCGGAAGGAACACCAATGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGC
## 3 AATGGGCGAAAGCCCGATCCAGCAATATCGCGTGAGTGAAGAAGGGCAATGCCGCTTGTAAAGCTCTTTCGTCGAGTGCGCGATCATGACAGGACTCGAGGAAGAAGCCCCGGCTAACTCCGTGCCAGCAGCCGCGGTAAGACGGGGGGGGCAAGTGTTCTTCGGAATGACTGGGCGTAAAGGGCACGTAGGCGGTGAATCGGGTTGAAAGTTAAAGTCGCCAAAAACTGGTGGAATGCTCTCGAAACCAATTCACTTGAGTGAGACAGAGGAGAGTGGAATTTCGTGTGTAGGGGTGAAATCCGCAGATCTACGAAGGAACGCCAAAAGCGAAGGCAGCTCTCTGGGTCCCTACCGACGCTGGAGTGCGAAAGC
## 4 AATGGGCGAAAGCCCGATCCAGCAATATCGCGTGAGTGAAGAAGGGCAATGCCGCTTGTAAAGCTCTTTCGTCGAGTGCGCGATCATGACAGGACTCGAGGAAGAAGCCCCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGTTAAGTGAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTCATACTGGCAATCTAGAGTCCAGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCTGGAACTGACGCTGAGGTGCGAAAGC
## 5 AATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTCACCGGTGAAGATAATGACGGTAACCGGAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAAGACGGGGGGGGCAAGTGTTCTTCGGAATGACTGGGCGTAAAGGGCACGTAGGCGGTGAATCGGGTTGAAAGTTAAAGTCGCCAAAAACTGGTGGAATGCTCTCGAAACCAATTCACTTGAGTGAGACAGAGGAGAGTGGAATTTCGTGTGTAGGGGTGAAATCCGCAGATCTACGAAGGAACGCCAAAAGCGAAGGCAGCTCTCTGGGTCCCTACCGACGCTGGAGTGCGAAAGC
## 7 AATGGGCGAAAGCCTGACGGAGCAATGCCGCGTGGAGGTAAAAGGCCTACGGGTCATGAACTTCTTTTCCTGGAGAAGAAACAATGACGGTATCCGGGGAATAAGCATCGGCTAACTCTGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGTTAAGTGAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTCATACTGGCAATCTAGAGTCCAGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCTGGAACTGACGCTGAGGTGCGAAAGC
## abundance forward reverse nmatch nmismatch nindel prefer accept
## 1 3236 1 1 60 0 0 1 TRUE
## 2 919 2 2 58 0 0 1 TRUE
## 3 497 3 3 58 0 0 1 TRUE
## 4 36 4 1 56 0 0 2 TRUE
## 5 27 6 3 62 0 0 2 TRUE
## 7 17 7 1 58 0 0 2 TRUE
The mergers object is a list of data.frames from each sample. Each data.frame contains the merged sequence, its abundance, and the indices of the forward and reverse sequence variants that were merged. Paired reads that did not exactly overlap were removed by mergePairs, further reducing spurious output.
We can now construct an amplicon sequence variant table (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
## [1] 54 525
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
##
## 371 372 373 374 375 376 377 378 389 390 391 392 393 394 395 396 397 398 399
## 6 1 200 4 60 1 17 4 1 1 1 4 11 1 7 3 4 195 4
The sequence table is a matrix with rows corresponding to (and named by) the samples, and columns corresponding to (and named by) the sequence variants.
The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
## Identified 277 bimeras out of 525 input sequences.
dim(seqtab.nochim)
## [1] 54 248
sum(seqtab.nochim)/sum(seqtab)
## [1] 0.9802173
write.csv(seqtab.nochim, "./16S/sequence_table.csv")
Here chimeras make up about 2% of the merged sequence variants.
As a final check of our progress, we’ll look at the number of reads that made it through each step in the pipeline:
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
track
## input filtered denoisedF denoisedR merged nonchim
## 141_1_N_N 5473 4890 4861 4848 4781 4663
## 141_1_N_S 3995 3605 3579 3580 3520 3377
## 141_1_R_N 3115 2767 2645 2648 2408 2372
## 141_1_R_S 3535 3159 3153 3146 3128 3095
## 141_2_N_N 7526 6747 6716 6710 6641 6476
## 141_2_N_S 8069 7176 7157 7145 7104 7036
## 141_2_R_N 7526 6677 6540 6580 6338 6267
## 141_2_R_S 3975 3473 3464 3453 3436 3436
## 141_5_N_N 6377 5680 5655 5651 5584 5408
## 141_5_N_S 4097 3651 3620 3639 3588 3579
## 141_5_R_N 5271 4688 4497 4538 4073 3950
## 141_5_R_S 5682 4987 4973 4962 4940 4926
## 141_7_N_N 7335 6646 6601 6599 6505 6318
## 141_7_N_S 3881 3522 3517 3506 3461 3344
## 141_7_R_N 4423 3943 3794 3841 3512 3444
## 141_7_R_S 4497 3954 3932 3942 3905 3905
## 717_10_N_N 10464 9461 9393 9400 9280 9091
## 717_10_N_S 8360 7529 7515 7486 7454 7352
## 717_10_R_N 8229 7225 7023 7046 6282 6217
## 717_10_R_S 6610 5949 5943 5935 5924 5920
## 717_5_N_N 7726 7020 6995 6950 6875 6754
## 717_5_N_S 7510 6754 6727 6727 6684 6648
## 717_5_R_N 6888 6122 5926 6002 5440 5210
## 717_5_R_S 5998 5234 5216 5199 5181 5172
## 717_6_N_N 7647 6792 6762 6731 6622 6486
## 717_6_N_S 6983 6198 6181 6167 6121 6106
## 717_6_R_N 7250 6418 6207 6303 5606 5546
## 717_6_R_S 5063 4429 4414 4411 4393 4393
## 717_9_N_N 5144 4665 4610 4613 4479 4386
## 717_9_N_S 6497 5781 5743 5761 5703 5695
## 717_9_R_N 4916 4360 4249 4276 3981 3919
## 717_9_R_S 4581 4059 4037 4047 4018 4018
## 733_10_N_N 10293 9289 9212 9230 8966 8464
## 733_10_N_S 10237 9129 9110 9088 9049 8981
## 733_10_R_N 5429 4812 4678 4663 3958 3352
## 733_10_R_S 6413 5743 5701 5700 5653 5636
## 733_4_N_N 8426 7434 7281 7345 7059 6601
## 733_4_N_S 7478 6840 6814 6815 6774 6762
## 733_4_R_N 7036 6173 6094 6096 5922 5572
## 733_4_R_S 3665 3274 3260 3255 3246 3246
## 733_8_N_N 7841 6912 6888 6884 6855 6797
## 733_8_N_S 7764 7038 7007 7007 6963 6905
## 733_8_R_N 7260 6448 6363 6370 6171 5950
## 733_8_R_S 6018 5235 5159 5149 5034 4932
## 733_9_N_N 1366 1214 1150 1156 1052 1038
## 733_9_N_S 6092 5519 5487 5498 5456 5450
## 733_9_R_N 3648 3270 3225 3234 3100 2971
## 733_9_R_S 4583 4132 4107 4108 4083 4070
## KIT_NEG_CTRL 24 12 9 9 9 9
## KIT_NEG_CTRL2 35 20 16 15 15 15
## WATER_NEG_CTRL 35 22 15 22 15 15
## WATER_NEG_CTRL2 48 18 17 17 17 17
## WaterFG 27 16 8 11 8 8
## WaterFG2 28 16 13 13 13 13
Looks good! We kept the majority of our raw reads, and there is no over-large drop associated with any single step after filtering.
It is common at this point, especially in 16S/18S/ITS amplicon sequencing, to assign taxonomy to the sequence variants. The DADA2 package provides a native implementation of the naive Bayesian classifier method for this purpose. The assignTaxonomy function takes as input a set of sequences to be classified and a training set of reference sequences with known taxonomy, and outputs taxonomic assignments with at least minBoot bootstrap confidence.
The recently developed IdTaxa taxonomic classification method is also available via the DECIPHER Bioconductor package. The paper introducing the IDTAXA algorithm reports classification performance that is better than the long-time standard set by the naive Bayesian classifier. Here we include a code block that allows you to use IdTaxa as a drop-in replacement for assignTaxonomy (and it’s faster as well!). Trained classifiers are available from http://DECIPHER.codes/Downloads.html.
dna <- DNAStringSet(getSequences(seqtab.nochim)) # Create a DNAStringSet from the ASVs
load("./16S/SILVA_SSU_r138_2019.RData")
ids <- IdTaxa(dna, trainingSet, strand="both", processors=NULL, verbose=FALSE) # use all processors
ranks <- c("domain", "phylum", "class", "order", "family", "genus", "species") # ranks of interest
# Convert the output object of class "Taxa" to a matrix analogous to the output from assignTaxonomy
taxid <- t(sapply(ids, function(x) {
m <- match(ranks, x$rank)
taxa <- x$taxon[m]
taxa[startsWith(taxa, "unclassified_")] <- NA
taxa
}))
colnames(taxid) <- ranks; rownames(taxid) <- getSequences(seqtab.nochim)
colnames(taxid) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
taxid[taxid == "Ensifer"] <- "Sinorhizobium"
taxa <- taxid
The phyloseq R package is a powerful framework for further analysis of microbiome data.
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE),
tax_table(taxa))
It is more convenient to use short names for our ASVs (e.g. ASV21) rather than the full DNA sequence when working with some of the tables and visualizations from phyloseq, but we want to keep the full DNA sequences for other purposes like merging with other datasets or indexing into reference databases like the Earth Microbiome Project. For that reason we’ll store the DNA sequences of our ASVs in the refseq slot of the phyloseq object, and then rename our taxa to a short string. That way, the short new taxa names will appear in tables and plots, and we can still recover the DNA sequences corresponding to each ASV as needed with refseq(ps).
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
write.csv(otu_table(ps), "./16S/asv_table.csv")
writeXStringSet(refseq(ps), "./16S/ASV.fasta")
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 248 taxa and 54 samples ]
## tax_table() Taxonomy Table: [ 248 taxa by 7 taxonomic ranks ]
## refseq() DNAStringSet: [ 248 reference sequences ]
Using the sample_data function, I can add a dataframe (here, a csv I uploaded named ‘metadata’) of non-numerical variables to categorize my data even further. To do this, I need to make sure that the rownames are the exact same as the rownames in the otu_table for the phyloseq (ps) object I am adding it to.
Upload and create modified dataframe with categories:
seqtab.sample_data <- read.csv("./16S_metadata.csv")
seqtab.sample_data2 <- seqtab.sample_data[,-1]
rownames(seqtab.sample_data2) <- seqtab.sample_data[,1]
Now add it to my phyloseq object:
sample_data(ps) <- seqtab.sample_data2
Remove controls from dataset:
ps <- prune_samples(sample_names(ps) != "KIT_NEG_CTRL", ps)
ps <- prune_samples(sample_names(ps) != "KIT_NEG_CTRL2", ps)
ps <- prune_samples(sample_names(ps) != "WATER_NEG_CTRL", ps)
ps <- prune_samples(sample_names(ps) != "WATER_NEG_CTRL2", ps)
ps <- prune_samples(sample_names(ps) != "WaterFG", ps)
ps <- prune_samples(sample_names(ps) != "WaterFG2", ps)
Additionally, there was an error made in labeling two samples. Sample
ID “733_8_N_N” in this dataset actually contains data for 733_8_R_S, and
sample ID “733-733_8_R_S-R-S” contains data for 733_8_N_N-8-N-N.
I.e. the two samples were switched when dispensing DNA into wells
before submission. So I wil lwhich the contents of these rows in the
sample_data for the ps object:
# Run only ONCE!
row1_index <- "733_8_N_N"
row2_index <- "733_8_R_S"
row1_data <- sample_data(ps)[row1_index, ]
row2_data <- sample_data(ps)[row2_index, ]
sample_data(ps)[row1_index, ] <- row2_data
sample_data(ps)[row2_index, ] <- row1_data
sample_data(ps)[row1_index, ]
## Inoculum Tissue Surface Group
## 733_8_N_N 733B + 141 Root Surface-sterilized Root, surface-sterilized
sample_data(ps)[row2_index, ]
## Inoculum Tissue Surface
## 733_8_R_S 733B + 141 Nodule Not surface-sterilized
## Group
## 733_8_R_S Nodule, not surface-sterilized
ps.organelle.rm <- ps
ps.organelle.rm <- subset_taxa(ps.organelle.rm, Order!="Rickettsiales")
ps.organelle.rm <- subset_taxa(ps.organelle.rm, Order!="Chloroplast")
ps.organelle.rm <- subset_taxa(ps.organelle.rm, is.na(Domain) == FALSE)
ps.organelle.rm.newnames <- ps.organelle.rm
taxa_names(ps.organelle.rm.newnames) <- paste0(taxa_names(ps.organelle.rm.newnames), " (", data.frame(tax_table(ps.organelle.rm.newnames))$Family, "; ", data.frame(tax_table(ps.organelle.rm.newnames))$Genus, ")")
counts <- otu_table(ps.organelle.rm.newnames)
metadata <- sample_data(ps.organelle.rm.newnames)
metadata <- data.frame(metadata)
metadata <- subset(metadata, select = -c(Group))
metadata$Tissue <- factor(metadata$Tissue, levels = c("Root", "Nodule"))
metadata$Surface <- factor(metadata$Surface, levels = c("Not surface-sterilized", "Surface-sterilized"))
metadata$Inoculum <- factor(metadata$Inoculum, levels = c("141", "717A + 141", "733B + 141"))
sampleorder <- row.names(metadata[order(metadata$Inoculum, metadata$Tissue, metadata$Surface), ]) # Set sample order
metadata <- metadata[sampleorder, ] # Reorder metadata table by sample order
counts <- counts[sampleorder ,] # Reorder matrix by sample order
counts <- t(counts) # Transpose matrix to flip rows and columns
counts <- data.frame(counts) # Convert from OTU table object to dataframe
colnames(counts) <- sub("X", "", colnames(counts)) # Remove "X" from beginning of column names that appeared for some reason
head(counts)
## 141_1_R_N 141_2_R_N 141_5_R_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 0 427 70
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 258 49 291
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 42 114 57
## 141_7_R_N 141_1_R_S 141_2_R_S
## ASV1 (Rhizobiaceae; Sinorhizobium) 15 0 14
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 285 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 91 0 0
## 141_5_R_S 141_7_R_S 141_1_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 12 9 3236
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 0 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 0 0 0
## 141_2_N_N 141_5_N_N 141_7_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 4928 4169 4446
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 7 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 0 0 0
## 141_1_N_S 141_2_N_S 141_5_N_S
## ASV1 (Rhizobiaceae; Sinorhizobium) 1662 4644 1391
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 0 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 0 0 0
## 141_7_N_S 717_10_R_N 717_5_R_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 1650 84 0
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 0 462 35
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 894 11
## ASV8 (Methylophilaceae; Methylotenera) 0 155 247
## 717_6_R_N 717_9_R_N 717_10_R_S
## ASV1 (Rhizobiaceae; Sinorhizobium) 11 17 5576
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 86 88 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 242 97 0
## ASV8 (Methylophilaceae; Methylotenera) 78 45 0
## 717_5_R_S 717_6_R_S 717_9_R_S
## ASV1 (Rhizobiaceae; Sinorhizobium) 20 31 20
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 0 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 0 0 0
## 717_10_N_N 717_5_N_N 717_6_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 7170 4285 3920
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 31 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 17 0 0
## ASV8 (Methylophilaceae; Methylotenera) 0 38 0
## 717_9_N_N 717_10_N_S 717_5_N_S
## ASV1 (Rhizobiaceae; Sinorhizobium) 2317 5666 4131
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 17 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 50 0 0
## 717_6_N_S 717_9_N_S 733_10_R_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 4110 2935 0
## ASV4 (Comamonadaceae; NA) 0 0 18
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 0 0 157
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 0 0 20
## 733_4_R_N 733_8_R_N 733_9_R_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 2311 3225 1809
## ASV4 (Comamonadaceae; NA) 785 637 251
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 0 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 21 10 5
## 733_10_R_S 733_4_R_S 733_8_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 15 502 24
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 0 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 11 0 0
## 733_9_R_S 733_10_N_N 733_4_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium) 68 4174 9
## ASV4 (Comamonadaceae; NA) 0 431 452
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 140 2384
## ASV6 (Rhizobiaceae; NA) 0 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 0 39 12
## 733_8_R_S 733_9_N_N 733_10_N_S
## ASV1 (Rhizobiaceae; Sinorhizobium) 12 0 5276
## ASV4 (Comamonadaceae; NA) 1251 47 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 13 139 0
## ASV6 (Rhizobiaceae; NA) 0 12 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 202 15 0
## 733_4_N_S 733_8_N_S 733_9_N_S
## ASV1 (Rhizobiaceae; Sinorhizobium) 4935 4877 3928
## ASV4 (Comamonadaceae; NA) 0 0 0
## ASV5 (Pseudomonadaceae; Pseudomonas) 0 0 0
## ASV6 (Rhizobiaceae; NA) 0 0 0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0 0 0
## ASV8 (Methylophilaceae; Methylotenera) 0 0 0
I’ll plot the distributions of a few samples:
ggplot(counts) +
geom_histogram(aes(x = `141_1_N_N`), stat = "bin", bins = 20) +
xlab("ASV abundance") +
ylab("Number of ASVs") +
ggtitle("141_1_N_N")
ggplot(counts) +
geom_histogram(aes(x = `717_5_R_S`), stat = "bin", bins = 20) +
xlab("ASV abundance") +
ylab("Number of ASVs") +
ggtitle("717_5_R_S")
ggplot(counts) +
geom_histogram(aes(x = `733_9_N_S`), stat = "bin", bins = 20) +
xlab("ASV abundance") +
ylab("Number of ASVs") +
ggtitle("733_9_N_S")
These samples are left-skewed. The tutorial shows a very similar pattern and states, “This plot illustrates some common features of RNA-seq count data: a low number of counts associated with a large proportion of genes, a long right tail due to the lack of any upper limit for expression, large dynamic range.” So I guess this is a good sign that this is what the typical input data for DESeq looks like at this stage.
To create a DESeq object from the counts, I need to make sure that the order of column names in the counts table is the same as the order of rownames for the metadata.
# reorder metadata's columns based on row order of the counts
metadata <- metadata[colnames(counts), ]
all(colnames(counts) == rownames(metadata))
## [1] TRUE
Make DESeq object
# Convert 0 values to ones
counts[counts==0] <- 1 # Convert NA's to pseudo-count 1's for rld
dds <- DESeqDataSetFromMatrix(countData=counts,
colData=metadata,
design= ~Inoculum + Tissue + Surface)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
dds
## class: DESeqDataSet
## dim: 204 48
## metadata(1): version
## assays(1): counts
## rownames(204): ASV1 (Rhizobiaceae; Sinorhizobium) ASV4 (Comamonadaceae;
## NA) ... ASV246 (NA; NA) ASV248 (Comamonadaceae; NA)
## rowData names(0):
## colnames(48): 141_1_R_N 141_2_R_N ... 733_8_N_S 733_9_N_S
## colData names(3): Inoculum Tissue Surface
Once the DESeq() function is used later on, it will use
the function estimateSizeFactors() to generate normalized
counts, so I do not have to do it manually here.
To explore the similarity of my samples, I will perform sample-level QC using Principal Component Analysis (PCA) and hierarchical clustering methods. The sample-level QC allows me to see how well the replicates cluster together, as well as, observe whether the experimental condition represents the major source of variation in the data. Performing sample-level QC can also identify any sample outliers, which may need to be explored further to determine whether they need to be removed prior to DE analysis.
dds_QC <- DESeq(dds, test="LRT", reduced=~1)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
dds_filt <- DGEList(counts=counts(dds_QC), group=metadata$Inoculum:metadata$Tissue:metadata$Surface)
keep <- filterByExpr(dds_filt)
table(keep)
## keep
## TRUE
## 204
None of them need to be removed.
Inoculum does not significantly affect ASV abundances.
Full model: design = ~ Inoculum + Tissue + Surface.
Reduced model: design = ~ Tissue + Surface
dds_lrt_full.vs.noInoc <- DESeq(dds, test="LRT", reduced = ~ Tissue + Surface)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
res_LRT_full.vs.noInoc <- results(dds_lrt_full.vs.noInoc)
# Create a tibble for LRT results
res_LRT_full.vs.noInoc_tb <- res_LRT_full.vs.noInoc %>%
data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_LRT_full.vs.noInoc_tb, "./16S/LRT_test_results_Inoc.csv", row.names = F)
# Subset to return ASVs with padj < 0.05
padj.cutoff <- 0.05 # Set alpha to 0.05
sigLRT_ASVs_full.vs.noInoc <- res_LRT_full.vs.noInoc_tb %>%
filter(padj < padj.cutoff)
insigLRT_ASVs_full.vs.noInoc <- res_LRT_full.vs.noInoc_tb %>%
filter(padj >= padj.cutoff)
sigLRT_ASVs_full.vs.noInoc
## # A tibble: 37 × 7
## ASV baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV4 (Comamonadaceae; … 85.5 -2.70 0.573 60.8 6.29e-14 4.84e-12
## 2 ASV5 (Pseudomonadaceae… 59.4 -2.35 0.635 28.7 5.91e- 7 9.10e- 6
## 3 ASV7 (Comamonadaceae; … 27.9 -2.38 0.581 30.8 2.02e- 7 5.19e- 6
## 4 ASV9 (Rhizobiaceae; NA) 23.3 -3.29 0.563 18.7 8.90e- 5 5.61e- 4
## 5 ASV14 (Sphingomonadace… 14.3 -3.61 0.517 17.2 1.88e- 4 9.64e- 4
## 6 ASV15 (Azospirillaceae… 12.8 -2.10 0.565 27.4 1.12e- 6 1.23e- 5
## 7 ASV17 (Rhizobiaceae; A… 10.7 -2.45 0.600 9.63 8.10e- 3 2.01e- 2
## 8 ASV18 (Sphingomonadace… 10.4 -1.94 0.581 21.6 2.03e- 5 1.95e- 4
## 9 ASV20 (Sphingomonadace… 9.18 -2.62 0.514 14.8 6.23e- 4 2.67e- 3
## 10 ASV21 (Devosiaceae; De… 9.01 -2.83 0.523 11.0 4.10e- 3 1.22e- 2
## # ℹ 27 more rows
nrow(sigLRT_ASVs_full.vs.noInoc)
## [1] 37
insigLRT_ASVs_full.vs.noInoc
## # A tibble: 40 × 7
## ASV baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorh… 2154. -0.359 0.687 3.29 0.193 0.232
## 2 ASV6 (Rhizobiaceae; NA) 39.2 -4.89 0.563 3.47 0.176 0.230
## 3 ASV8 (Methylophilaceae; Me… 27.8 -5.11 0.535 2.86 0.239 0.282
## 4 ASV11 (Azospirillaceae; Az… 21.8 -4.80 0.574 2.37 0.306 0.337
## 5 ASV16 (Caulobacteraceae; P… 11.6 -3.71 0.509 1.44 0.486 0.519
## 6 ASV19 (Sphingomonadaceae; … 9.25 -3.25 0.508 1.51 0.470 0.509
## 7 ASV22 (Rhodocyclaceae; Dec… 8.86 -3.11 0.513 0.0723 0.965 0.965
## 8 ASV25 (Enterobacteriaceae;… 5.84 2.68 0.503 3.71 0.157 0.219
## 9 ASV29 (Rhodobacteraceae; N… 5.34 -2.95 0.511 0.0705 0.965 0.965
## 10 ASV33 (Microscillaceae; Oh… 4.96 -2.22 0.544 4.47 0.107 0.165
## # ℹ 30 more rows
nrow(insigLRT_ASVs_full.vs.noInoc)
## [1] 40
res_inoc_717A_vs_141 <- results(dds_lrt_full.vs.noInoc, contrast = c("Inoculum", "717A + 141", "141"), alpha = 0.05) # Baseline is 141 inoculum
head(res_inoc_717A_vs_141 %>% as.data.frame())
## baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.84652 1.648332850
## ASV4 (Comamonadaceae; NA) 85.46005 -0.004799983
## ASV5 (Pseudomonadaceae; Pseudomonas) 59.39209 -0.004441811
## ASV6 (Rhizobiaceae; NA) 39.20911 0.039710119
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 27.92732 4.195912406
## ASV8 (Methylophilaceae; Methylotenera) 27.80852 0.831357561
## lfcSE stat pvalue
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.8419657 3.294766 1.925532e-01
## ASV4 (Comamonadaceae; NA) 0.7565939 60.794638 6.289443e-14
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.8232654 28.683774 5.907416e-07
## ASV6 (Rhizobiaceae; NA) 0.6547935 3.470635 1.763442e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.6960142 30.829545 2.020456e-07
## ASV8 (Methylophilaceae; Methylotenera) 0.6416827 2.864414 2.387813e-01
## padj
## ASV1 (Rhizobiaceae; Sinorhizobium) 2.709734e-01
## ASV4 (Comamonadaceae; NA) 5.660499e-12
## ASV5 (Pseudomonadaceae; Pseudomonas) 1.063335e-05
## ASV6 (Rhizobiaceae; NA) 2.689997e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 6.061368e-06
## ASV8 (Methylophilaceae; Methylotenera) 3.203821e-01
# Apply fold change shrinkage
res_inoc_717A_vs_141 <- lfcShrink(dds_lrt_full.vs.noInoc, coef="Inoculum_717A...141_vs_141")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_inoc_717A_vs_141
## log2 fold change (MAP): Inoculum 717A...141 vs 141
## LRT p-value: '~ Inoculum + Tissue + Surface' vs '~ Tissue + Surface'
## DataFrame with 204 rows and 5 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.8465 0.52267506
## ASV4 (Comamonadaceae; NA) 85.4601 -0.00114549
## ASV5 (Pseudomonadaceae; Pseudomonas) 59.3921 -0.00441186
## ASV6 (Rhizobiaceae; NA) 39.2091 0.01682279
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 27.9273 3.81775075
## ... ... ...
## ASV243 (NA; NA) 1.05099 -0.00352642
## ASV244 (NA; NA) 1.05099 -0.00352642
## ASV245 (NA; NA) 1.05099 -0.00352642
## ASV246 (NA; NA) 1.05099 -0.00352642
## ASV248 (Comamonadaceae; NA) 1.05099 -0.00352642
## lfcSE pvalue
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.726192 1.92553e-01
## ASV4 (Comamonadaceae; NA) 0.472213 6.28944e-14
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.489396 5.90742e-07
## ASV6 (Rhizobiaceae; NA) 0.433545 1.76344e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.925550 2.02046e-07
## ... ... ...
## ASV243 (NA; NA) 0.390332 0.999962
## ASV244 (NA; NA) 0.390332 0.999962
## ASV245 (NA; NA) 0.390332 0.999962
## ASV246 (NA; NA) 0.390332 0.999962
## ASV248 (Comamonadaceae; NA) 0.390332 0.999962
## padj
## <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 2.31833e-01
## ASV4 (Comamonadaceae; NA) 4.84287e-12
## ASV5 (Pseudomonadaceae; Pseudomonas) 9.09742e-06
## ASV6 (Rhizobiaceae; NA) 2.30144e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 5.18584e-06
## ... ...
## ASV243 (NA; NA) NA
## ASV244 (NA; NA) NA
## ASV245 (NA; NA) NA
## ASV246 (NA; NA) NA
## ASV248 (Comamonadaceae; NA) NA
summary(res_inoc_717A_vs_141, alpha = 0.05)
##
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 27, 13%
## LFC < 0 (down) : 10, 4.9%
## outliers [1] : 20, 9.8%
## low counts [2] : 107, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141 %>%
as.data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_inoc_717A_vs_141_tb, "./16S/test_results_Inoc_717A141_vs_141_shrunken_LFC.csv", row.names = F)
head(res_inoc_717A_vs_141_tb)
## # A tibble: 6 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo… 2154. 0.523 0.726 1.93e- 1 2.32e- 1
## 2 ASV4 (Comamonadaceae; NA) 85.5 -0.00115 0.472 6.29e-14 4.84e-12
## 3 ASV5 (Pseudomonadaceae; Pseud… 59.4 -0.00441 0.489 5.91e- 7 9.10e- 6
## 4 ASV6 (Rhizobiaceae; NA) 39.2 0.0168 0.434 1.76e- 1 2.30e- 1
## 5 ASV7 (Comamonadaceae; Candida… 27.9 3.82 0.926 2.02e- 7 5.19e- 6
## 6 ASV8 (Methylophilaceae; Methy… 27.8 0.403 0.514 2.39e- 1 2.82e- 1
sig_inoc_717A_vs_141 <- res_inoc_717A_vs_141_tb %>% filter(padj < 0.05)
write.csv(sig_inoc_717A_vs_141, "./16S/significant_results_Inoc_717A141_vs_141_shrunken_LFC.csv", row.names = F)
sig_inoc_717A_vs_141
## # A tibble: 37 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV4 (Comamonadaceae; NA) 85.5 -0.00115 0.472 6.29e-14 4.84e-12
## 2 ASV5 (Pseudomonadaceae; Pseu… 59.4 -0.00441 0.489 5.91e- 7 9.10e- 6
## 3 ASV7 (Comamonadaceae; Candid… 27.9 3.82 0.926 2.02e- 7 5.19e- 6
## 4 ASV9 (Rhizobiaceae; NA) 23.3 3.06 0.897 8.90e- 5 5.61e- 4
## 5 ASV14 (Sphingomonadaceae; No… 14.3 2.88 0.818 1.88e- 4 9.64e- 4
## 6 ASV15 (Azospirillaceae; Azos… 12.8 3.17 0.839 1.12e- 6 1.23e- 5
## 7 ASV17 (Rhizobiaceae; Allorhi… 10.7 1.89 0.999 8.10e- 3 2.01e- 2
## 8 ASV18 (Sphingomonadaceae; NA) 10.4 2.80 0.878 2.03e- 5 1.95e- 4
## 9 ASV20 (Sphingomonadaceae; No… 9.18 -0.950 0.620 6.23e- 4 2.67e- 3
## 10 ASV21 (Devosiaceae; Devosia) 9.01 2.04 0.815 4.10e- 3 1.22e- 2
## # ℹ 27 more rows
nrow(sig_inoc_717A_vs_141)
## [1] 37
Of the ASVs whose abundances are significantly affected by inoculum, which ones are differentially abundant in the 733B + 141 inoculua compared to the baseline 141 inoculum?
res_inoc_733B_vs_141 <- results(dds_lrt_full.vs.noInoc, contrast = c("Inoculum", "733B + 141", "141"), alpha = 0.05) # Baseline is 141 inoculum
head(res_inoc_733B_vs_141 %>% as.data.frame())
## baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.84652 1.594262191
## ASV4 (Comamonadaceae; NA) 85.46005 6.548528151
## ASV5 (Pseudomonadaceae; Pseudomonas) 59.39209 4.884055747
## ASV6 (Rhizobiaceae; NA) 39.20911 -1.266132353
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 27.92732 -0.003228623
## ASV8 (Methylophilaceae; Methylotenera) 27.80852 1.151624816
## lfcSE stat pvalue
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.8419685 3.294766 1.925532e-01
## ASV4 (Comamonadaceae; NA) 0.6801262 60.794638 6.289443e-14
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.7632069 28.683774 5.907416e-07
## ASV6 (Rhizobiaceae; NA) 0.6691364 3.470635 1.763442e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.7624188 30.829545 2.020456e-07
## ASV8 (Methylophilaceae; Methylotenera) 0.6374028 2.864414 2.387813e-01
## padj
## ASV1 (Rhizobiaceae; Sinorhizobium) 2.709734e-01
## ASV4 (Comamonadaceae; NA) 5.660499e-12
## ASV5 (Pseudomonadaceae; Pseudomonas) 1.063335e-05
## ASV6 (Rhizobiaceae; NA) 2.689997e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 6.061368e-06
## ASV8 (Methylophilaceae; Methylotenera) 3.203821e-01
# Apply fold change shrinkage
res_inoc_733B_vs_141 <- lfcShrink(dds_lrt_full.vs.noInoc, coef="Inoculum_733B...141_vs_141")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_inoc_733B_vs_141
## log2 fold change (MAP): Inoculum 733B...141 vs 141
## LRT p-value: '~ Inoculum + Tissue + Surface' vs '~ Tissue + Surface'
## DataFrame with 204 rows and 5 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.8465 0.37234868
## ASV4 (Comamonadaceae; NA) 85.4601 6.32162328
## ASV5 (Pseudomonadaceae; Pseudomonas) 59.3921 4.36706306
## ASV6 (Rhizobiaceae; NA) 39.2091 -0.43785604
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 27.9273 -0.00355456
## ... ... ...
## ASV243 (NA; NA) 1.05099 -0.000570683
## ASV244 (NA; NA) 1.05099 -0.000570683
## ASV245 (NA; NA) 1.05099 -0.000570683
## ASV246 (NA; NA) 1.05099 -0.000570683
## ASV248 (Comamonadaceae; NA) 1.05099 -0.000570683
## lfcSE pvalue
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.594031 1.92553e-01
## ASV4 (Comamonadaceae; NA) 0.875980 6.28944e-14
## ASV5 (Pseudomonadaceae; Pseudomonas) 1.149829 5.90742e-07
## ASV6 (Rhizobiaceae; NA) 0.581588 1.76344e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.405339 2.02046e-07
## ... ... ...
## ASV243 (NA; NA) 0.347941 0.999962
## ASV244 (NA; NA) 0.347941 0.999962
## ASV245 (NA; NA) 0.347941 0.999962
## ASV246 (NA; NA) 0.347941 0.999962
## ASV248 (Comamonadaceae; NA) 0.347941 0.999962
## padj
## <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 2.31833e-01
## ASV4 (Comamonadaceae; NA) 4.84287e-12
## ASV5 (Pseudomonadaceae; Pseudomonas) 9.09742e-06
## ASV6 (Rhizobiaceae; NA) 2.30144e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 5.18584e-06
## ... ...
## ASV243 (NA; NA) NA
## ASV244 (NA; NA) NA
## ASV245 (NA; NA) NA
## ASV246 (NA; NA) NA
## ASV248 (Comamonadaceae; NA) NA
summary(res_inoc_733B_vs_141, alpha = 0.05)
##
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 12, 5.9%
## LFC < 0 (down) : 25, 12%
## outliers [1] : 20, 9.8%
## low counts [2] : 107, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141 %>%
as.data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_inoc_733B_vs_141_tb, "./16S/test_results_Inoc_733B141_vs_141_shrunken_LFC.csv", row.names = F)
head(res_inoc_733B_vs_141_tb)
## # A tibble: 6 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo… 2154. 0.372 0.594 1.93e- 1 2.32e- 1
## 2 ASV4 (Comamonadaceae; NA) 85.5 6.32 0.876 6.29e-14 4.84e-12
## 3 ASV5 (Pseudomonadaceae; Pseud… 59.4 4.37 1.15 5.91e- 7 9.10e- 6
## 4 ASV6 (Rhizobiaceae; NA) 39.2 -0.438 0.582 1.76e- 1 2.30e- 1
## 5 ASV7 (Comamonadaceae; Candida… 27.9 -0.00355 0.405 2.02e- 7 5.19e- 6
## 6 ASV8 (Methylophilaceae; Methy… 27.8 0.453 0.560 2.39e- 1 2.82e- 1
sig_inoc_733B_vs_141 <- res_inoc_733B_vs_141_tb %>% filter(padj < 0.05)
write.csv(sig_inoc_733B_vs_141, "./16S/significant_results_Inoc_733B141_vs_141_shrunken_LFC.csv", row.names = F)
sig_inoc_733B_vs_141
## # A tibble: 37 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV4 (Comamonadaceae; NA) 85.5 6.32 0.876 6.29e-14 4.84e-12
## 2 ASV5 (Pseudomonadaceae; Pseu… 59.4 4.37 1.15 5.91e- 7 9.10e- 6
## 3 ASV7 (Comamonadaceae; Candid… 27.9 -0.00355 0.405 2.02e- 7 5.19e- 6
## 4 ASV9 (Rhizobiaceae; NA) 23.3 0.411 0.592 8.90e- 5 5.61e- 4
## 5 ASV14 (Sphingomonadaceae; No… 14.3 1.30 0.950 1.88e- 4 9.64e- 4
## 6 ASV15 (Azospirillaceae; Azos… 12.8 -0.00101 0.400 1.12e- 6 1.23e- 5
## 7 ASV17 (Rhizobiaceae; Allorhi… 10.7 0.252 0.478 8.10e- 3 2.01e- 2
## 8 ASV18 (Sphingomonadaceae; NA) 10.4 -0.000973 0.402 2.03e- 5 1.95e- 4
## 9 ASV20 (Sphingomonadaceae; No… 9.18 -2.24 0.783 6.23e- 4 2.67e- 3
## 10 ASV21 (Devosiaceae; Devosia) 9.01 0.874 0.872 4.10e- 3 1.22e- 2
## # ℹ 27 more rows
nrow(sig_inoc_733B_vs_141)
## [1] 37
log2FoldChange >= 1 since we are working with log2 fold changes so this translates to an actual fold change of 2
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(threshold_717A141 = padj < 0.05 & abs(log2FoldChange) >= 1)
res_inoc_717A_vs_141_tb$label <- res_inoc_717A_vs_141_tb$ASV
res_inoc_717A_vs_141_tb$label <- gsub("[A-Za-z]+; ", "", res_inoc_717A_vs_141_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Function to modify the label based on the ASV column
modify_label <- function(asv_value, label_value) {
if (grepl("\\(NA\\)", label_value)) {
asv_text <- gsub(".*?\\((.*?)\\;.*", "\\1", asv_value)
modified_label <- gsub("NA", asv_text, label_value)
return(modified_label)
} else {
return(label_value)
}
}
# Apply the function to create a new_label column
res_inoc_717A_vs_141_tb$new_label <- mapply(modify_label, res_inoc_717A_vs_141_tb$ASV, res_inoc_717A_vs_141_tb$label)
## Remove the label if it does not meet the threshold
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(new_label = case_when(threshold_717A141 == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_inoc_717A_vs_141_tb$new_label_2 <- gsub(" \\(.*", "", res_inoc_717A_vs_141_tb$ASV)
## Remove the label if it does not meet the threshold
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(new_label_2 = case_when(threshold_717A141 == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_inoc_717A_vs_141_tb$new_label_4 <- res_inoc_717A_vs_141_tb$new_label_2
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(new_label_4 = ifelse(
ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)",
list("ASV5", new_label_2),
ifelse(
ASV == "ASV28 (Bacillaceae; Bacillus)",
list("ASV28", new_label_2),
ifelse(
ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
list("ASV1", new_label_2),
new_label_2
)
)
))
volcano.inoc_717A_vs_141 <- ggplot(res_inoc_717A_vs_141_tb, aes(x = log2FoldChange, y = -log10(padj))) +
geom_point(aes(colour = threshold_717A141)) +
geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
theme_linedraw() +
ggtitle("Inoculum: 717A + 141 vs 141") +
theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.inoc_717A_vs_141
## Warning: Removed 127 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(threshold_733B141 = padj < 0.05 & abs(log2FoldChange) >= 1)
res_inoc_733B_vs_141_tb$label <- res_inoc_733B_vs_141_tb$ASV
res_inoc_733B_vs_141_tb$label <- gsub("[A-Za-z]+; ", "", res_inoc_733B_vs_141_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function to create a new_label column
res_inoc_733B_vs_141_tb$new_label <- mapply(modify_label, res_inoc_733B_vs_141_tb$ASV, res_inoc_733B_vs_141_tb$label)
## Remove the label if it does not meet the threshold
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(new_label = case_when(threshold_733B141 == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_inoc_733B_vs_141_tb$new_label_2 <- gsub(" \\(.*", "", res_inoc_733B_vs_141_tb$ASV)
## Remove the label if it does not meet the threshold
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(new_label_2 = case_when(threshold_733B141 == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_inoc_733B_vs_141_tb$new_label_4 <- res_inoc_733B_vs_141_tb$new_label_2
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(new_label_4 = ifelse(
ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)",
list("ASV5", new_label_2),
ifelse(
ASV == "ASV28 (Bacillaceae; Bacillus)",
list("ASV28", new_label_2),
ifelse(
ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
list("ASV1", new_label_2),
new_label_2
)
)
))
volcano.inoc_733B_vs_141 <- ggplot(res_inoc_733B_vs_141_tb, aes(x = log2FoldChange, y = -log10(padj))) +
geom_point(aes(colour = threshold_733B141)) +
geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
theme_linedraw() +
ggtitle("Inoculum: 733B + 141 vs 141") +
theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.inoc_733B_vs_141
## Warning: Removed 127 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
Tissue does not significantly affect ASV abundances.
Full model: design = ~ Inoculum + Tissue + Surface.
Reduced model: design = ~ Inoculum + Surface
dds_lrt_full.vs.noTissue <- DESeq(dds, test="LRT", reduced = ~ Inoculum + Surface)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
res_LRT_full.vs.noTissue <- results(dds_lrt_full.vs.noTissue)
# Create a tibble for LRT results
res_LRT_full.vs.noTissue_tb <- res_LRT_full.vs.noTissue %>%
data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_LRT_full.vs.noTissue_tb, "./16S/LRT_test_results_Tissue.csv", row.names = F)
# Subset to return ASVs with padj < 0.05
padj.cutoff <- 0.05 # Set alpha to 0.05
sigLRT_ASVs_full.vs.noTissue <- res_LRT_full.vs.noTissue_tb %>%
filter(padj < padj.cutoff)
insigLRT_ASVs_full.vs.noTissue <- res_LRT_full.vs.noTissue_tb %>%
filter(padj >= padj.cutoff)
sigLRT_ASVs_full.vs.noTissue
## # A tibble: 39 × 7
## ASV baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sino… 2154. -0.359 0.687 15.1 1.01e-4 1.23e-3
## 2 ASV5 (Pseudomonadaceae; … 59.4 -2.35 0.635 12.2 4.67e-4 2.54e-3
## 3 ASV6 (Rhizobiaceae; NA) 39.2 -4.89 0.563 16.4 5.13e-5 8.32e-4
## 4 ASV7 (Comamonadaceae; Ca… 27.9 -2.38 0.581 7.88 5.01e-3 1.45e-2
## 5 ASV8 (Methylophilaceae; … 27.8 -5.11 0.535 6.21 1.27e-2 2.61e-2
## 6 ASV9 (Rhizobiaceae; NA) 23.3 -3.29 0.563 10.1 1.52e-3 5.79e-3
## 7 ASV11 (Azospirillaceae; … 21.8 -4.80 0.574 5.71 1.69e-2 3.12e-2
## 8 ASV15 (Azospirillaceae; … 12.8 -2.10 0.565 6.20 1.28e-2 2.61e-2
## 9 ASV16 (Caulobacteraceae;… 11.6 -3.71 0.509 12.9 3.31e-4 2.24e-3
## 10 ASV17 (Rhizobiaceae; All… 10.7 -2.45 0.600 11.0 9.15e-4 3.98e-3
## # ℹ 29 more rows
nrow(sigLRT_ASVs_full.vs.noTissue)
## [1] 39
insigLRT_ASVs_full.vs.noTissue
## # A tibble: 22 × 7
## ASV baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV4 (Comamonadaceae; NA) 85.5 -2.70 0.573 0.103 0.748 0.748
## 2 ASV14 (Sphingomonadaceae; … 14.3 -3.61 0.517 2.31 0.128 0.140
## 3 ASV24 (Comamonadaceae; Pse… 8.08 -2.27 0.479 2.62 0.105 0.118
## 4 ASV28 (Bacillaceae; Bacill… 6.51 -1.91 0.536 2.98 0.0844 0.105
## 5 ASV29 (Rhodobacteraceae; N… 5.34 -2.95 0.511 4.41 0.0358 0.0520
## 6 ASV34 (Hyphomonadaceae; SW… 4.84 -2.84 0.505 3.59 0.0581 0.0770
## 7 ASV42 (Comamonadaceae; Pel… 3.04 -1.50 0.508 0.168 0.682 0.694
## 8 ASV45 (Xanthobacteraceae; … 2.89 -1.54 0.506 2.99 0.0840 0.105
## 9 ASV48 (Rhizobiaceae; NA) 2.79 -1.44 0.460 3.77 0.0522 0.0708
## 10 ASV54 (Microscillaceae; Oh… 2.22 -1.58 0.472 1.46 0.226 0.238
## # ℹ 12 more rows
nrow(insigLRT_ASVs_full.vs.noTissue)
## [1] 22
The differences in abundance of 39 ASVs across inoculum treatment are significant. Of those 39, which ones are differentially abundant in the Nodule samples compared to the Root samples?
res_tissue_Nod_vs_Root <- results(dds_lrt_full.vs.noTissue, contrast = c("Tissue", "Nodule", "Root"), alpha = 0.05) # Baseline is Root
head(res_tissue_Nod_vs_Root %>% as.data.frame())
## baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.84652 3.1847272
## ASV4 (Comamonadaceae; NA) 85.46005 0.2022541
## ASV5 (Pseudomonadaceae; Pseudomonas) 59.39209 2.3732709
## ASV6 (Rhizobiaceae; NA) 39.20911 -2.7495204
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 27.92732 -2.1627127
## ASV8 (Methylophilaceae; Methylotenera) 27.80852 -1.4060265
## lfcSE stat
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.6874251 15.1264452
## ASV4 (Comamonadaceae; NA) 0.5559333 0.1028331
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.6349567 12.2443458
## ASV6 (Rhizobiaceae; NA) 0.5519223 16.4011054
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.5790733 7.8763500
## ASV8 (Methylophilaceae; Methylotenera) 0.5194104 6.2098710
## pvalue padj
## ASV1 (Rhizobiaceae; Sinorhizobium) 1.005449e-04 0.0011663211
## ASV4 (Comamonadaceae; NA) 7.484559e-01 0.7484558750
## ASV5 (Pseudomonadaceae; Pseudomonas) 4.666701e-04 0.0024194971
## ASV6 (Rhizobiaceae; NA) 5.125533e-05 0.0007913209
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 5.008547e-03 0.0138331295
## ASV8 (Methylophilaceae; Methylotenera) 1.270399e-02 0.0247890346
# Apply fold change shrinkage
res_tissue_Nod_vs_Root <- lfcShrink(dds_lrt_full.vs.noTissue, coef="Tissue_Nodule_vs_Root")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_tissue_Nod_vs_Root
## log2 fold change (MAP): Tissue Nodule vs Root
## LRT p-value: '~ Inoculum + Tissue + Surface' vs '~ Inoculum + Surface'
## DataFrame with 204 rows and 5 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.8465 2.7592776
## ASV4 (Comamonadaceae; NA) 85.4601 0.0883234
## ASV5 (Pseudomonadaceae; Pseudomonas) 59.3921 1.5802816
## ASV6 (Rhizobiaceae; NA) 39.2091 -2.4192687
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 27.9273 -1.5373577
## ... ... ...
## ASV243 (NA; NA) 1.05099 0.00312418
## ASV244 (NA; NA) 1.05099 0.00312418
## ASV245 (NA; NA) 1.05099 0.00312418
## ASV246 (NA; NA) 1.05099 0.00312418
## ASV248 (Comamonadaceae; NA) 1.05099 0.00312418
## lfcSE pvalue
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.852680 1.00545e-04
## ASV4 (Comamonadaceae; NA) 0.405891 7.48456e-01
## ASV5 (Pseudomonadaceae; Pseudomonas) 1.063894 4.66670e-04
## ASV6 (Rhizobiaceae; NA) 0.680140 5.12553e-05
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.832125 5.00855e-03
## ... ... ...
## ASV243 (NA; NA) 0.342704 0.98894
## ASV244 (NA; NA) 0.342704 0.98894
## ASV245 (NA; NA) 0.342704 0.98894
## ASV246 (NA; NA) 0.342704 0.98894
## ASV248 (Comamonadaceae; NA) 0.342704 0.98894
## padj
## <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.001226648
## ASV4 (Comamonadaceae; NA) 0.748455875
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.002544643
## ASV6 (Rhizobiaceae; NA) 0.000832251
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.014548636
## ... ...
## ASV243 (NA; NA) NA
## ASV244 (NA; NA) NA
## ASV245 (NA; NA) NA
## ASV246 (NA; NA) NA
## ASV248 (Comamonadaceae; NA) NA
summary(res_tissue_Nod_vs_Root, alpha = 0.05)
##
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 3, 1.5%
## LFC < 0 (down) : 36, 18%
## outliers [1] : 20, 9.8%
## low counts [2] : 123, 60%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root %>%
as.data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_tissue_Nod_vs_Root_tb, "./16S/test_results_Tissue_Nodule_vs_Root_shrunken_LFC.csv", row.names = F)
head(res_tissue_Nod_vs_Root_tb)
## # A tibble: 6 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizobi… 2154. 2.76 0.853 1.01e-4 1.23e-3
## 2 ASV4 (Comamonadaceae; NA) 85.5 0.0883 0.406 7.48e-1 7.48e-1
## 3 ASV5 (Pseudomonadaceae; Pseudom… 59.4 1.58 1.06 4.67e-4 2.54e-3
## 4 ASV6 (Rhizobiaceae; NA) 39.2 -2.42 0.680 5.13e-5 8.32e-4
## 5 ASV7 (Comamonadaceae; Candidatu… 27.9 -1.54 0.832 5.01e-3 1.45e-2
## 6 ASV8 (Methylophilaceae; Methylo… 27.8 -1.01 0.592 1.27e-2 2.61e-2
sig_inoc_Nod_vs_Root <- res_tissue_Nod_vs_Root_tb %>% filter(padj < 0.05)
write.csv(sig_inoc_Nod_vs_Root, "./16S/significant_results_Tissue_Nodule_vs_Root_shrunken_LFC.csv", row.names = F)
sig_inoc_Nod_vs_Root
## # A tibble: 39 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizob… 2154. 2.76 0.853 1.01e-4 1.23e-3
## 2 ASV5 (Pseudomonadaceae; Pseudo… 59.4 1.58 1.06 4.67e-4 2.54e-3
## 3 ASV6 (Rhizobiaceae; NA) 39.2 -2.42 0.680 5.13e-5 8.32e-4
## 4 ASV7 (Comamonadaceae; Candidat… 27.9 -1.54 0.832 5.01e-3 1.45e-2
## 5 ASV8 (Methylophilaceae; Methyl… 27.8 -1.01 0.592 1.27e-2 2.61e-2
## 6 ASV9 (Rhizobiaceae; NA) 23.3 -1.70 0.715 1.52e-3 5.79e-3
## 7 ASV11 (Azospirillaceae; Azospi… 21.8 -1.02 0.675 1.69e-2 3.12e-2
## 8 ASV15 (Azospirillaceae; Azospi… 12.8 -1.15 0.724 1.28e-2 2.61e-2
## 9 ASV16 (Caulobacteraceae; Pheny… 11.6 -1.75 0.603 3.31e-4 2.24e-3
## 10 ASV17 (Rhizobiaceae; Allorhizo… 10.7 -1.97 0.782 9.15e-4 3.98e-3
## # ℹ 29 more rows
nrow(sig_inoc_Nod_vs_Root)
## [1] 39
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(threshold_NodRoot = padj < 0.05 & abs(log2FoldChange) >= 1)
res_tissue_Nod_vs_Root_tb$label <- res_tissue_Nod_vs_Root_tb$ASV
res_tissue_Nod_vs_Root_tb$label <- gsub("[A-Za-z]+; ", "", res_tissue_Nod_vs_Root_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function made above to create a new_label column
res_tissue_Nod_vs_Root_tb$new_label <- mapply(modify_label, res_tissue_Nod_vs_Root_tb$ASV, res_tissue_Nod_vs_Root_tb$label)
## Remove the label if it does not meet the threshold
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(new_label = case_when(threshold_NodRoot == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_tissue_Nod_vs_Root_tb$new_label_2 <- gsub(" \\(.*", "", res_tissue_Nod_vs_Root_tb$ASV)
## Remove the label if it does not meet the threshold
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(new_label_2 = case_when(threshold_NodRoot == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_tissue_Nod_vs_Root_tb$new_label_4 <- res_tissue_Nod_vs_Root_tb$new_label_2
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(new_label_4 = ifelse(
ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)",
list("ASV5", new_label_2),
ifelse(
ASV == "ASV28 (Bacillaceae; Bacillus)",
list("ASV28", new_label_2),
ifelse(
ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
list("ASV1", new_label_2),
new_label_2
)
)
))
volcano.inoc_Nod_vs_Root <- ggplot(res_tissue_Nod_vs_Root_tb, aes(x = log2FoldChange, y = -log10(padj))) +
geom_point(aes(colour = threshold_NodRoot)) +
geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
theme_linedraw() +
ggtitle("Nodule vs Root") +
theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.inoc_Nod_vs_Root
## Warning: Removed 143 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
Surface sterilization does not significantly affect ASV abundances.
Full model: design = ~ Inoculum + Tissue + Surface.
Reduced model: design = ~ Inoculum + Tissue
dds_lrt_full.vs.noSurface <- DESeq(dds, test="LRT", reduced = ~ Inoculum + Tissue)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
res_LRT_full.vs.noSurface <- results(dds_lrt_full.vs.noSurface)
# Create a tibble for LRT results
res_LRT_full.vs.noSurface_tb <- res_LRT_full.vs.noSurface %>%
data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_LRT_full.vs.noSurface_tb, "./16S/LRT_test_results_Surface.csv", row.names = F)
# Subset to return ASVs with padj < 0.05
padj.cutoff <- 0.05 # Set alpha to 0.05
sigLRT_ASVs_full.vs.noSurface <- res_LRT_full.vs.noSurface_tb %>%
filter(padj < padj.cutoff)
insigLRT_ASVs_full.vs.noSurface <- res_LRT_full.vs.noSurface_tb %>%
filter(padj >= padj.cutoff)
sigLRT_ASVs_full.vs.noSurface
## # A tibble: 47 × 7
## ASV baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV4 (Comamonadaceae; … 85.5 -2.70 0.573 11.0 9.16e- 4 3.78e- 3
## 2 ASV5 (Pseudomonadaceae… 59.4 -2.35 0.635 12.1 5.02e- 4 2.49e- 3
## 3 ASV6 (Rhizobiaceae; NA) 39.2 -4.89 0.563 50.9 9.58e-13 3.35e-11
## 4 ASV7 (Comamonadaceae; … 27.9 -2.38 0.581 8.86 2.92e- 3 1.02e- 2
## 5 ASV8 (Methylophilaceae… 27.8 -5.11 0.535 74.9 4.90e-18 5.15e-16
## 6 ASV9 (Rhizobiaceae; NA) 23.3 -3.29 0.563 22.5 2.13e- 6 1.83e- 5
## 7 ASV11 (Azospirillaceae… 21.8 -4.80 0.574 54.2 1.80e-13 9.46e-12
## 8 ASV14 (Sphingomonadace… 14.3 -3.61 0.517 35.6 2.38e- 9 5.00e- 8
## 9 ASV15 (Azospirillaceae… 12.8 -2.10 0.565 8.49 3.58e- 3 1.17e- 2
## 10 ASV16 (Caulobacteracea… 11.6 -3.71 0.509 44.4 2.67e-11 7.01e-10
## # ℹ 37 more rows
nrow(sigLRT_ASVs_full.vs.noSurface)
## [1] 47
insigLRT_ASVs_full.vs.noSurface
## # A tibble: 58 × 7
## ASV baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorh… 2154. -0.359 0.687 0.210 0.647 0.647
## 2 ASV61 (Caulobacteraceae; C… 2.10 -1.08 0.470 4.86 0.0275 0.0603
## 3 ASV64 (Rhizobiales Incerta… 1.95 -1.02 0.460 4.65 0.0310 0.0652
## 4 ASV65 (Xanthomonadaceae; P… 1.95 -1.03 0.456 4.81 0.0282 0.0605
## 5 ASV69 (Flavobacteriaceae; … 1.84 -0.968 0.455 4.36 0.0368 0.0757
## 6 ASV71 (Rhizobiaceae; NA) 1.77 -0.933 0.447 4.25 0.0392 0.0792
## 7 ASV74 (Reyranellaceae; Rey… 1.72 -0.894 0.450 3.87 0.0492 0.0957
## 8 ASV81 (Caulobacteraceae; A… 1.55 -0.566 0.417 1.84 0.175 0.259
## 9 ASV82 (Xanthobacteraceae; … 1.53 -0.865 0.428 4.16 0.0413 0.0819
## 10 ASV84 (Burkholderiaceae; L… 1.54 -0.753 0.452 2.74 0.0977 0.180
## # ℹ 48 more rows
nrow(insigLRT_ASVs_full.vs.noSurface)
## [1] 58
The differences in abundance of 47 ASVs across inoculum treatment are significant. Of those 47, which ones are differentially abundant in the Nodule samples compared to the Root samples?
res_tissue_Sterile_vs_Not <- results(dds_lrt_full.vs.noSurface, contrast = c("Surface", "Surface-sterilized", "Not surface-sterilized"), alpha = 0.05) # Baseline is Not surface-sterilized
head(res_tissue_Sterile_vs_Not %>% as.data.frame())
## baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.84652 -0.3593853
## ASV4 (Comamonadaceae; NA) 85.46005 -2.7042808
## ASV5 (Pseudomonadaceae; Pseudomonas) 59.39209 -2.3521719
## ASV6 (Rhizobiaceae; NA) 39.20911 -4.8897374
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 27.92732 -2.3764809
## ASV8 (Methylophilaceae; Methylotenera) 27.80852 -5.1073218
## lfcSE stat
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.6874250 0.2097697
## ASV4 (Comamonadaceae; NA) 0.5728428 10.9906257
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.6349587 12.1099015
## ASV6 (Rhizobiaceae; NA) 0.5634659 50.9278125
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.5810043 8.8564065
## ASV8 (Methylophilaceae; Methylotenera) 0.5345081 74.9194790
## pvalue padj
## ASV1 (Rhizobiaceae; Sinorhizobium) 6.469479e-01 6.469479e-01
## ASV4 (Comamonadaceae; NA) 9.157389e-04 2.337995e-03
## ASV5 (Pseudomonadaceae; Pseudomonas) 5.015479e-04 1.540387e-03
## ASV6 (Rhizobiaceae; NA) 9.582611e-13 2.076232e-11
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 2.920620e-03 6.328010e-03
## ASV8 (Methylophilaceae; Methylotenera) 4.903086e-18 3.187006e-16
# Apply fold change shrinkage
res_tissue_Sterile_vs_Not <- lfcShrink(dds_lrt_full.vs.noSurface, coef="Surface_Surface.sterilized_vs_Not.surface.sterilized")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_tissue_Sterile_vs_Not
## log2 fold change (MAP): Surface Surface.sterilized vs Not.surface.sterilized
## LRT p-value: '~ Inoculum + Tissue + Surface' vs '~ Inoculum + Tissue'
## DataFrame with 204 rows and 5 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.8465 -0.187296
## ASV4 (Comamonadaceae; NA) 85.4601 -2.157491
## ASV5 (Pseudomonadaceae; Pseudomonas) 59.3921 -1.731137
## ASV6 (Rhizobiaceae; NA) 39.2091 -4.697198
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 27.9273 -1.846352
## ... ... ...
## ASV243 (NA; NA) 1.05099 0.00933195
## ASV244 (NA; NA) 1.05099 0.00933195
## ASV245 (NA; NA) 1.05099 0.00933195
## ASV246 (NA; NA) 1.05099 0.00933195
## ASV248 (Comamonadaceae; NA) 1.05099 0.00933195
## lfcSE pvalue
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.576363 6.46948e-01
## ASV4 (Comamonadaceae; NA) 0.854205 9.15739e-04
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.953002 5.01548e-04
## ASV6 (Rhizobiaceae; NA) 0.654391 9.58261e-13
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.806849 2.92062e-03
## ... ... ...
## ASV243 (NA; NA) 0.3866 0.979122
## ASV244 (NA; NA) 0.3866 0.979122
## ASV245 (NA; NA) 0.3866 0.979122
## ASV246 (NA; NA) 0.3866 0.979122
## ASV248 (Comamonadaceae; NA) 0.3866 0.979122
## padj
## <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 6.46948e-01
## ASV4 (Comamonadaceae; NA) 3.77676e-03
## ASV5 (Pseudomonadaceae; Pseudomonas) 2.48832e-03
## ASV6 (Rhizobiaceae; NA) 3.35391e-11
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 1.02222e-02
## ... ...
## ASV243 (NA; NA) NA
## ASV244 (NA; NA) NA
## ASV245 (NA; NA) NA
## ASV246 (NA; NA) NA
## ASV248 (Comamonadaceae; NA) NA
summary(res_tissue_Sterile_vs_Not, alpha = 0.05)
##
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1, 0.49%
## LFC < 0 (down) : 46, 23%
## outliers [1] : 20, 9.8%
## low counts [2] : 79, 39%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not %>%
as.data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_tissue_Sterile_vs_Not_tb, "./16S/test_results_Surface_Sterile_vs_NotSterile_shrunken_LFC.csv", row.names = F)
head(res_tissue_Sterile_vs_Not_tb)
## # A tibble: 6 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo… 2154. -0.187 0.576 6.47e- 1 6.47e- 1
## 2 ASV4 (Comamonadaceae; NA) 85.5 -2.16 0.854 9.16e- 4 3.78e- 3
## 3 ASV5 (Pseudomonadaceae; Pseud… 59.4 -1.73 0.953 5.02e- 4 2.49e- 3
## 4 ASV6 (Rhizobiaceae; NA) 39.2 -4.70 0.654 9.58e-13 3.35e-11
## 5 ASV7 (Comamonadaceae; Candida… 27.9 -1.85 0.807 2.92e- 3 1.02e- 2
## 6 ASV8 (Methylophilaceae; Methy… 27.8 -4.99 0.546 4.90e-18 5.15e-16
sig_inoc_Sterile_vs_Not <- res_tissue_Sterile_vs_Not_tb %>% filter(padj < 0.05)
write.csv(sig_inoc_Sterile_vs_Not, "./16S/significant_results_Surface_Sterile_vs_NotSterile_shrunken_LFC.csv", row.names = F)
sig_inoc_Sterile_vs_Not
## # A tibble: 47 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV4 (Comamonadaceae; NA) 85.5 -2.16 0.854 9.16e- 4 3.78e- 3
## 2 ASV5 (Pseudomonadaceae; Pseu… 59.4 -1.73 0.953 5.02e- 4 2.49e- 3
## 3 ASV6 (Rhizobiaceae; NA) 39.2 -4.70 0.654 9.58e-13 3.35e-11
## 4 ASV7 (Comamonadaceae; Candid… 27.9 -1.85 0.807 2.92e- 3 1.02e- 2
## 5 ASV8 (Methylophilaceae; Meth… 27.8 -4.99 0.546 4.90e-18 5.15e-16
## 6 ASV9 (Rhizobiaceae; NA) 23.3 -3.00 0.688 2.13e- 6 1.83e- 5
## 7 ASV11 (Azospirillaceae; Azos… 21.8 -4.64 0.614 1.80e-13 9.46e-12
## 8 ASV14 (Sphingomonadaceae; No… 14.3 -3.41 0.586 2.38e- 9 5.00e- 8
## 9 ASV15 (Azospirillaceae; Azos… 12.8 -1.66 0.725 3.58e- 3 1.17e- 2
## 10 ASV16 (Caulobacteraceae; Phe… 11.6 -3.55 0.547 2.67e-11 7.01e-10
## # ℹ 37 more rows
nrow(sig_inoc_Sterile_vs_Not)
## [1] 47
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(threshold_Sterile = padj < 0.05 & abs(log2FoldChange) >= 1)
res_tissue_Sterile_vs_Not_tb$label <- res_tissue_Sterile_vs_Not_tb$ASV
res_tissue_Sterile_vs_Not_tb$label <- gsub("[A-Za-z]+; ", "", res_tissue_Sterile_vs_Not_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function made above to create a new_label column
res_tissue_Sterile_vs_Not_tb$new_label <- mapply(modify_label, res_tissue_Sterile_vs_Not_tb$ASV, res_tissue_Sterile_vs_Not_tb$label)
## Remove the label if it does not meet the threshold
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(new_label = case_when(threshold_Sterile == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_tissue_Sterile_vs_Not_tb$new_label_2 <- gsub(" \\(.*", "", res_tissue_Sterile_vs_Not_tb$ASV)
## Remove the label if it does not meet the threshold
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(new_label_2 = case_when(threshold_Sterile == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_tissue_Sterile_vs_Not_tb$new_label_4 <- res_tissue_Sterile_vs_Not_tb$new_label_2
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(new_label_4 = ifelse(
ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)",
list("ASV5", new_label_2),
ifelse(
ASV == "ASV28 (Bacillaceae; Bacillus)",
list("ASV28", new_label_2),
ifelse(
ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
list("ASV1", new_label_2),
new_label_2
)
)
))
volcano.inoc_Sterile_vs_Not <- ggplot(res_tissue_Sterile_vs_Not_tb, aes(x = log2FoldChange, y = -log10(padj))) +
geom_point(aes(colour = threshold_Sterile)) +
geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
theme_linedraw() +
ggtitle("Surface-sterilized vs not surface-sterilized") +
theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.inoc_Sterile_vs_Not
## Warning: Removed 99 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
I can also model and test the abundances of ASVs as influenced by the interaction of Tissue and Surface. This would be useful because I could contrast groups of ASVs in the same Tissue treatment but different Surface treatments.
dds_interact <- dds
dds_interact$TissueSurface <- factor(paste0(dds_interact$Tissue, dds_interact$Surface))
design(dds_interact) <- ~ TissueSurface
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
dds_interact <- DESeq(dds_interact)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 34 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
The interactions of Tissue and Surface does not significantly affect ASV abundances.
Full model: design = ~TissueSurface.
Reduced model: design = ~ 1 (the intercept)
dds_lrt_Interact.vs.Intercept <- DESeq(dds_interact, test="LRT", reduced = ~ 1) # reduced model is just the intercept
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 34 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
res_LRT_Interact.vs.Intercept <- results(dds_lrt_Interact.vs.Intercept)
# Create a tibble for LRT results
res_LRT_Interact.vs.Intercept_tb <- res_LRT_Interact.vs.Intercept %>%
data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_LRT_Interact.vs.Intercept_tb, "./16S/LRT_test_results_Interact_vs_Intercept.csv", row.names = F)
# Subset to return ASVs with padj < 0.05
padj.cutoff <- 0.05 # Set alpha to 0.05
sigLRT_ASVs_Interact.vs.Intercept <- res_LRT_Interact.vs.Intercept_tb %>%
filter(padj < padj.cutoff)
insigLRT_ASVs_Interact.vs.Intercept <- res_LRT_Interact.vs.Intercept_tb %>%
filter(padj >= padj.cutoff)
sigLRT_ASVs_Interact.vs.Intercept
## # A tibble: 60 × 7
## ASV baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Si… 2154. -2.62 0.987 12.2 6.88e- 3 1.40e- 2
## 2 ASV4 (Comamonadaceae; … 27.1 -6.28 1.03 48.4 1.78e-10 9.21e-10
## 3 ASV5 (Pseudomonadaceae… 7.36 -4.64 0.791 71.4 2.09e-15 1.58e-14
## 4 ASV6 (Rhizobiaceae; NA) 39.2 -2.64 0.795 136. 3.25e-29 3.18e-27
## 5 ASV7 (Comamonadaceae; … 8.89 -1.22 0.881 59.5 7.64e-13 4.41e-12
## 6 ASV8 (Methylophilaceae… 27.8 -4.03 0.737 81.5 1.49e-17 1.62e-16
## 7 ASV9 (Rhizobiaceae; NA) 23.3 -1.66 0.873 97.0 6.85e-21 1.12e-19
## 8 ASV10 (Comamonadaceae;… 3.58 0.00540 0.844 43.1 2.32e- 9 9.88e- 9
## 9 ASV11 (Azospirillaceae… 21.8 -3.75 0.814 83.0 6.89e-18 8.44e-17
## 10 ASV13 (Rhizobiaceae; N… 4.28 -3.57 0.732 41.2 5.99e- 9 2.17e- 8
## # ℹ 50 more rows
nrow(sigLRT_ASVs_Interact.vs.Intercept)
## [1] 60
insigLRT_ASVs_Interact.vs.Intercept
## # A tibble: 38 × 7
## ASV baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV18 (Sphingomonadaceae; … 1.57 -0.657 0.654 6.05 0.109 0.151
## 2 ASV32 (Beijerinckiaceae; B… 1.44 0.00540 0.682 8.27 0.0408 0.0624
## 3 ASV36 (Comamonadaceae; Pse… 1.53 -0.731 0.626 5.11 0.164 0.208
## 4 ASV40 (Sphingomonadaceae; … 1.42 -1.27 0.578 8.46 0.0373 0.0581
## 5 ASV42 (Comamonadaceae; Pel… 1.31 -0.994 0.582 4.94 0.176 0.218
## 6 ASV45 (Xanthobacteraceae; … 1.61 -0.579 0.668 7.41 0.0600 0.0891
## 7 ASV61 (Caulobacteraceae; C… 1.29 0.00540 0.647 4.20 0.241 0.281
## 8 ASV94 (Xanthobacteraceae; … 1.46 0.00540 0.686 8.81 0.0320 0.0505
## 9 ASV95 (Beijerinckiaceae; B… 1.46 0.00540 0.687 8.83 0.0316 0.0505
## 10 ASV97 (Stappiaceae; NA) 1.40 0.00540 0.655 7.65 0.0538 0.0811
## # ℹ 28 more rows
nrow(insigLRT_ASVs_Interact.vs.Intercept)
## [1] 38
res_interact_Nod_Sterile_vs_Nod_Not <- results(dds_lrt_Interact.vs.Intercept, contrast = c("TissueSurface", "NoduleSurface sterilized", "NoduleNot surface-sterilized"), alpha = 0.05) # Baseline is not surface-sterilized Nodule samples
head(res_interact_Nod_Sterile_vs_Nod_Not %>% as.data.frame())
## baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.846522 0.2294456
## ASV4 (Comamonadaceae; NA) 27.144536 -6.2809012
## ASV5 (Pseudomonadaceae; Pseudomonas) 7.363803 -4.6396968
## ASV6 (Rhizobiaceae; NA) 39.209107 -2.6365163
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 8.892439 -1.2156154
## ASV8 (Methylophilaceae; Methylotenera) 27.808524 -4.9053165
## lfcSE stat pvalue
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.9868250 12.15318 6.876427e-03
## ASV4 (Comamonadaceae; NA) 1.0320566 48.36033 1.784799e-10
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.7909397 71.44332 2.094871e-15
## ASV6 (Rhizobiaceae; NA) 0.7949539 135.66803 3.246443e-29
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.8805514 59.46638 7.642626e-13
## ASV8 (Methylophilaceae; Methylotenera) 0.7885698 81.46230 1.490531e-17
## padj
## ASV1 (Rhizobiaceae; Sinorhizobium) 1.160397e-02
## ASV4 (Comamonadaceae; NA) 7.608882e-10
## ASV5 (Pseudomonadaceae; Pseudomonas) 1.305266e-14
## ASV6 (Rhizobiaceae; NA) 2.629619e-27
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 3.641487e-12
## ASV8 (Methylophilaceae; Methylotenera) 1.341478e-16
# Apply fold change shrinkage
res_interact_Nod_Sterile_vs_Nod_Not <- lfcShrink(dds_lrt_Interact.vs.Intercept, coef="TissueSurface_NoduleSurface.sterilized_vs_NoduleNot.surface.sterilized")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_interact_Nod_Sterile_vs_Nod_Not
## log2 fold change (MAP): TissueSurface NoduleSurface.sterilized vs NoduleNot.surface.sterilized
## LRT p-value: '~ TissueSurface' vs '~ 1'
## DataFrame with 204 rows and 5 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.84652 0.0118120
## ASV4 (Comamonadaceae; NA) 27.14454 -7.1927280
## ASV5 (Pseudomonadaceae; Pseudomonas) 7.36380 -7.6231874
## ASV6 (Rhizobiaceae; NA) 39.20911 -2.0349494
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 8.89244 -0.0758511
## ... ... ...
## ASV243 (NA; NA) 1.05099 -4.36247e-05
## ASV244 (NA; NA) 1.05099 -4.36247e-05
## ASV245 (NA; NA) 1.05099 -4.36247e-05
## ASV246 (NA; NA) 1.05099 -4.36247e-05
## ASV248 (Comamonadaceae; NA) 1.05099 -4.36247e-05
## lfcSE pvalue
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.215657 6.87643e-03
## ASV4 (Comamonadaceae; NA) 1.051241 1.78480e-10
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.791297 2.09487e-15
## ASV6 (Rhizobiaceae; NA) 0.930714 3.24644e-29
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.232491 7.64263e-13
## ... ... ...
## ASV243 (NA; NA) 0.207625 0.999991
## ASV244 (NA; NA) 0.207625 0.999991
## ASV245 (NA; NA) 0.207625 0.999991
## ASV246 (NA; NA) 0.207625 0.999991
## ASV248 (Comamonadaceae; NA) 0.207625 0.999991
## padj
## <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 1.40394e-02
## ASV4 (Comamonadaceae; NA) 9.20581e-10
## ASV5 (Pseudomonadaceae; Pseudomonas) 1.57921e-14
## ASV6 (Rhizobiaceae; NA) 3.18151e-27
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 4.40575e-12
## ... ...
## ASV243 (NA; NA) NA
## ASV244 (NA; NA) NA
## ASV245 (NA; NA) NA
## ASV246 (NA; NA) NA
## ASV248 (Comamonadaceae; NA) NA
summary(res_interact_Nod_Sterile_vs_Nod_Not, alpha = 0.05)
##
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 31, 15%
## LFC < 0 (down) : 29, 14%
## outliers [1] : 0, 0%
## low counts [2] : 106, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not %>%
as.data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_interact_Nod_Sterile_vs_Nod_Not_tb, "./16S/test_results_Nod_Sterile_vs_Nod_NotSterile_shrunken_LFC.csv", row.names = F)
head(res_interact_Nod_Sterile_vs_Nod_Not_tb)
## # A tibble: 6 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo… 2154. 0.0118 0.216 6.88e- 3 1.40e- 2
## 2 ASV4 (Comamonadaceae; NA) 27.1 -7.19 1.05 1.78e-10 9.21e-10
## 3 ASV5 (Pseudomonadaceae; Pseud… 7.36 -7.62 0.791 2.09e-15 1.58e-14
## 4 ASV6 (Rhizobiaceae; NA) 39.2 -2.03 0.931 3.25e-29 3.18e-27
## 5 ASV7 (Comamonadaceae; Candida… 8.89 -0.0759 0.232 7.64e-13 4.41e-12
## 6 ASV8 (Methylophilaceae; Methy… 27.8 -4.64 0.807 1.49e-17 1.62e-16
sig_interact_Nod_Sterile_vs_Nod_Not <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% filter(padj < 0.05)
write.csv(sig_interact_Nod_Sterile_vs_Nod_Not, "./16S/significant_results_sig_interact_Nod_Sterile_vs_Nod_Not_shrunken_LFC.csv", row.names = F)
sig_interact_Nod_Sterile_vs_Nod_Not
## # A tibble: 60 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhiz… 2154. 0.0118 0.216 6.88e- 3 1.40e- 2
## 2 ASV4 (Comamonadaceae; NA) 27.1 -7.19 1.05 1.78e-10 9.21e-10
## 3 ASV5 (Pseudomonadaceae; Pseu… 7.36 -7.62 0.791 2.09e-15 1.58e-14
## 4 ASV6 (Rhizobiaceae; NA) 39.2 -2.03 0.931 3.25e-29 3.18e-27
## 5 ASV7 (Comamonadaceae; Candid… 8.89 -0.0759 0.232 7.64e-13 4.41e-12
## 6 ASV8 (Methylophilaceae; Meth… 27.8 -4.64 0.807 1.49e-17 1.62e-16
## 7 ASV9 (Rhizobiaceae; NA) 23.3 -0.109 0.254 6.85e-21 1.12e-19
## 8 ASV10 (Comamonadaceae; Azohy… 3.58 0.0000515 0.213 2.32e- 9 9.88e- 9
## 9 ASV11 (Azospirillaceae; Azos… 21.8 -3.35 0.860 6.89e-18 8.44e-17
## 10 ASV13 (Rhizobiaceae; NA) 4.28 -3.24 0.763 5.99e- 9 2.17e- 8
## # ℹ 50 more rows
nrow(sig_interact_Nod_Sterile_vs_Nod_Not)
## [1] 60
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(threshold_TissueSurface = padj < 0.05 & abs(log2FoldChange) >= 1)
res_interact_Nod_Sterile_vs_Nod_Not_tb$label <- res_interact_Nod_Sterile_vs_Nod_Not_tb$ASV
res_interact_Nod_Sterile_vs_Nod_Not_tb$label <- gsub("[A-Za-z]+; ", "", res_interact_Nod_Sterile_vs_Nod_Not_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function made above to create a new_label column
res_interact_Nod_Sterile_vs_Nod_Not_tb$new_label <- mapply(modify_label, res_interact_Nod_Sterile_vs_Nod_Not_tb$ASV, res_interact_Nod_Sterile_vs_Nod_Not_tb$label)
## Remove the label if it does not meet the threshold
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(new_label = case_when(threshold_TissueSurface == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_interact_Nod_Sterile_vs_Nod_Not_tb$new_label_2 <- gsub(" \\(.*", "", res_interact_Nod_Sterile_vs_Nod_Not_tb$ASV)
## Remove the label if it does not meet the threshold
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(new_label_2 = case_when(threshold_TissueSurface == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_interact_Nod_Sterile_vs_Nod_Not_tb$new_label_4 <- res_interact_Nod_Sterile_vs_Nod_Not_tb$new_label_2
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(new_label_4 = ifelse(
ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)",
list("ASV5", new_label_2),
ifelse(
ASV == "ASV28 (Bacillaceae; Bacillus)",
list("ASV28", new_label_2),
ifelse(
ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
list("ASV1", new_label_2),
new_label_2
)
)
))
volcano.interact_NodSterile_vs_NodNot <- ggplot(res_interact_Nod_Sterile_vs_Nod_Not_tb, aes(x = log2FoldChange, y = -log10(padj))) +
geom_point(aes(colour = threshold_TissueSurface)) +
geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
theme_linedraw() +
ggtitle("Nodule, surface-sterilized vs nodule, not surface-sterilized") +
theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.interact_NodSterile_vs_NodNot
## Warning: Removed 106 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
This requires me to re-factor the “Group” variable so I can change the base for the comparisons:
dds_interact$TissueSurface <- relevel(dds_interact$TissueSurface, ref = "RootNot surface-sterilized")
dds_lrt_Interact.vs.Intercept <- DESeq(dds_interact, test="LRT", reduced = ~ 1) # reduced model is just the intercept
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 34 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
res_interact_Root_Sterile_vs_Root_Not <- results(dds_lrt_Interact.vs.Intercept, contrast = c("TissueSurface", "RootSurface sterilized", "RootNot surface-sterilized"), alpha = 0.05) # Baseline is not surface-sterilized Root samples
head(res_interact_Root_Sterile_vs_Root_Not %>% as.data.frame())
## baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.846522 -0.33507335
## ASV4 (Comamonadaceae; NA) 27.144536 -4.53231774
## ASV5 (Pseudomonadaceae; Pseudomonas) 7.363803 0.01739461
## ASV6 (Rhizobiaceae; NA) 39.209107 -7.13371069
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 8.892439 -4.87741815
## ASV8 (Methylophilaceae; Methylotenera) 27.808524 -5.30882147
## lfcSE stat pvalue
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.9870736 12.15318 6.876427e-03
## ASV4 (Comamonadaceae; NA) 1.0345649 48.36033 1.784799e-10
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.8900004 71.44332 2.094871e-15
## ASV6 (Rhizobiaceae; NA) 0.7780653 135.66803 3.246443e-29
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.8407156 59.46638 7.642626e-13
## ASV8 (Methylophilaceae; Methylotenera) 0.7346109 81.46230 1.490531e-17
## padj
## ASV1 (Rhizobiaceae; Sinorhizobium) 1.160397e-02
## ASV4 (Comamonadaceae; NA) 7.608882e-10
## ASV5 (Pseudomonadaceae; Pseudomonas) 1.305266e-14
## ASV6 (Rhizobiaceae; NA) 2.629619e-27
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 3.641487e-12
## ASV8 (Methylophilaceae; Methylotenera) 1.341478e-16
# Save the unshrunken results to compare
res_interact_Root_Sterile_vs_Root_Not_unshrunken <- res_interact_Root_Sterile_vs_Root_Not
# Apply fold change shrinkage
res_interact_Root_Sterile_vs_Root_Not <- lfcShrink(dds_lrt_Interact.vs.Intercept, coef="TissueSurface_RootSurface.sterilized_vs_RootNot.surface.sterilized")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_interact_Root_Sterile_vs_Root_Not
## log2 fold change (MAP): TissueSurface RootSurface.sterilized vs RootNot.surface.sterilized
## LRT p-value: '~ TissueSurface' vs '~ 1'
## DataFrame with 204 rows and 5 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 2153.84652 -0.17252697
## ASV4 (Comamonadaceae; NA) 27.14454 -6.82583471
## ASV5 (Pseudomonadaceae; Pseudomonas) 7.36380 0.00649048
## ASV6 (Rhizobiaceae; NA) 39.20911 -6.95908415
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 8.89244 -6.44955796
## ... ... ...
## ASV243 (NA; NA) 1.05099 0.0168483
## ASV244 (NA; NA) 1.05099 0.0168483
## ASV245 (NA; NA) 1.05099 0.0168483
## ASV246 (NA; NA) 1.05099 0.0168483
## ASV248 (Comamonadaceae; NA) 1.05099 0.0168483
## lfcSE pvalue
## <numeric> <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 0.716681 6.87643e-03
## ASV4 (Comamonadaceae; NA) 1.051229 1.78480e-10
## ASV5 (Pseudomonadaceae; Pseudomonas) 0.670480 2.09487e-15
## ASV6 (Rhizobiaceae; NA) 0.782577 3.24644e-29
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.846666 7.64263e-13
## ... ... ...
## ASV243 (NA; NA) 0.527531 0.999991
## ASV244 (NA; NA) 0.527531 0.999991
## ASV245 (NA; NA) 0.527531 0.999991
## ASV246 (NA; NA) 0.527531 0.999991
## ASV248 (Comamonadaceae; NA) 0.527531 0.999991
## padj
## <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium) 1.40394e-02
## ASV4 (Comamonadaceae; NA) 9.20581e-10
## ASV5 (Pseudomonadaceae; Pseudomonas) 1.57921e-14
## ASV6 (Rhizobiaceae; NA) 3.18151e-27
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 4.40575e-12
## ... ...
## ASV243 (NA; NA) NA
## ASV244 (NA; NA) NA
## ASV245 (NA; NA) NA
## ASV246 (NA; NA) NA
## ASV248 (Comamonadaceae; NA) NA
summary(res_interact_Root_Sterile_vs_Root_Not, alpha = 0.05)
##
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 7, 3.4%
## LFC < 0 (down) : 53, 26%
## outliers [1] : 0, 0%
## low counts [2] : 106, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not %>%
as.data.frame() %>%
rownames_to_column(var="ASV") %>%
as_tibble()
write.csv(res_interact_Root_Sterile_vs_Root_Not_tb, "./16S/test_results_Root_Sterile_vs_Root_NotSterile_shrunken_LFC.csv", row.names = F)
head(res_interact_Root_Sterile_vs_Root_Not_tb)
## # A tibble: 6 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo… 2154. -0.173 0.717 6.88e- 3 1.40e- 2
## 2 ASV4 (Comamonadaceae; NA) 27.1 -6.83 1.05 1.78e-10 9.21e-10
## 3 ASV5 (Pseudomonadaceae; Pseud… 7.36 0.00649 0.670 2.09e-15 1.58e-14
## 4 ASV6 (Rhizobiaceae; NA) 39.2 -6.96 0.783 3.25e-29 3.18e-27
## 5 ASV7 (Comamonadaceae; Candida… 8.89 -6.45 0.847 7.64e-13 4.41e-12
## 6 ASV8 (Methylophilaceae; Methy… 27.8 -5.11 0.746 1.49e-17 1.62e-16
sig_interact_Root_Sterile_vs_Root_Not <- res_interact_Root_Sterile_vs_Root_Not_tb %>% filter(padj < 0.05)
write.csv(sig_interact_Root_Sterile_vs_Root_Not, "./16S/significant_results_interact_Root_Sterile_vs_Root_Not_shrunken_LFC.csv", row.names = F)
sig_interact_Root_Sterile_vs_Root_Not
## # A tibble: 60 × 6
## ASV baseMean log2FoldChange lfcSE pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhiz… 2154. -0.173 0.717 6.88e- 3 1.40e- 2
## 2 ASV4 (Comamonadaceae; NA) 27.1 -6.83 1.05 1.78e-10 9.21e-10
## 3 ASV5 (Pseudomonadaceae; Pseu… 7.36 0.00649 0.670 2.09e-15 1.58e-14
## 4 ASV6 (Rhizobiaceae; NA) 39.2 -6.96 0.783 3.25e-29 3.18e-27
## 5 ASV7 (Comamonadaceae; Candid… 8.89 -6.45 0.847 7.64e-13 4.41e-12
## 6 ASV8 (Methylophilaceae; Meth… 27.8 -5.11 0.746 1.49e-17 1.62e-16
## 7 ASV9 (Rhizobiaceae; NA) 23.3 -6.15 0.852 6.85e-21 1.12e-19
## 8 ASV10 (Comamonadaceae; Azohy… 3.58 -6.19 0.739 2.32e- 9 9.88e- 9
## 9 ASV11 (Azospirillaceae; Azos… 21.8 -5.86 0.817 6.89e-18 8.44e-17
## 10 ASV13 (Rhizobiaceae; NA) 4.28 -5.60 0.729 5.99e- 9 2.17e- 8
## # ℹ 50 more rows
nrow(sig_interact_Root_Sterile_vs_Root_Not)
## [1] 60
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(threshold_TissueSurface = padj < 0.05 & abs(log2FoldChange) >= 1)
res_interact_Root_Sterile_vs_Root_Not_tb$label <- res_interact_Root_Sterile_vs_Root_Not_tb$ASV
res_interact_Root_Sterile_vs_Root_Not_tb$label <- gsub("[A-Za-z]+; ", "", res_interact_Root_Sterile_vs_Root_Not_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function made above to create a new_label column
res_interact_Root_Sterile_vs_Root_Not_tb$new_label <- mapply(modify_label, res_interact_Root_Sterile_vs_Root_Not_tb$ASV, res_interact_Root_Sterile_vs_Root_Not_tb$label)
## Remove the label if it does not meet the threshold
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(new_label = case_when(threshold_TissueSurface == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_interact_Root_Sterile_vs_Root_Not_tb$new_label_2 <- gsub(" \\(.*", "", res_interact_Root_Sterile_vs_Root_Not_tb$ASV)
## Remove the label if it does not meet the threshold
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(new_label_2 = case_when(threshold_TissueSurface == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_interact_Root_Sterile_vs_Root_Not_tb$new_label_4 <- res_interact_Root_Sterile_vs_Root_Not_tb$new_label_2
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(new_label_4 = ifelse(
ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)",
list("ASV5", new_label_2),
ifelse(
ASV == "ASV28 (Bacillaceae; Bacillus)",
list("ASV28", new_label_2),
ifelse(
ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
list("ASV1", new_label_2),
new_label_2
)
)
))
volcano.interact_RootSterile_vs_RootNot <- ggplot(res_interact_Root_Sterile_vs_Root_Not_tb, aes(x = log2FoldChange, y = -log10(padj))) +
geom_point(aes(colour = threshold_TissueSurface)) +
geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
theme_linedraw() +
ggtitle("Root, surface-sterilized vs root, not surface-sterilized") +
theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.interact_RootSterile_vs_RootNot
## Warning: Removed 106 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
FigS1 <- cowplot::plot_grid(volcano.inoc_717A_vs_141, volcano.inoc_733B_vs_141, volcano.inoc_Nod_vs_Root, volcano.inoc_Sterile_vs_Not, volcano.interact_NodSterile_vs_NodNot, volcano.interact_RootSterile_vs_RootNot, ncol = 2, labels = "AUTO", align = "v", axis = "l", label_size = 24, label_fontfamily = "sans")
## Warning: Removed 127 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 127 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 143 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 99 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 106 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 106 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
FigS1
ggsave("./16S/FigS1.png", plot=FigS1, device = "png", width = 6.5, height = 6.5, units = "in", dpi = 300, scale = 2)
vsd_mat_fullmodel <- assay(vst(dds, blind=TRUE, nsub=sum(rowMeans(counts(dds))>5)))
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
vsd_mat_fullmodel <- vsd_mat_fullmodel[1:50,]
mat_col <- data.frame(metadata[,1:3])
mat_col[colnames(vsd_mat_fullmodel), ] # reorder so it is the same as the vsd matrix
## Inoculum Tissue Surface
## 141_1_R_N 141 Root Not surface-sterilized
## 141_2_R_N 141 Root Not surface-sterilized
## 141_5_R_N 141 Root Not surface-sterilized
## 141_7_R_N 141 Root Not surface-sterilized
## 141_1_R_S 141 Root Surface-sterilized
## 141_2_R_S 141 Root Surface-sterilized
## 141_5_R_S 141 Root Surface-sterilized
## 141_7_R_S 141 Root Surface-sterilized
## 141_1_N_N 141 Nodule Not surface-sterilized
## 141_2_N_N 141 Nodule Not surface-sterilized
## 141_5_N_N 141 Nodule Not surface-sterilized
## 141_7_N_N 141 Nodule Not surface-sterilized
## 141_1_N_S 141 Nodule Surface-sterilized
## 141_2_N_S 141 Nodule Surface-sterilized
## 141_5_N_S 141 Nodule Surface-sterilized
## 141_7_N_S 141 Nodule Surface-sterilized
## 717_10_R_N 717A + 141 Root Not surface-sterilized
## 717_5_R_N 717A + 141 Root Not surface-sterilized
## 717_6_R_N 717A + 141 Root Not surface-sterilized
## 717_9_R_N 717A + 141 Root Not surface-sterilized
## 717_10_R_S 717A + 141 Root Surface-sterilized
## 717_5_R_S 717A + 141 Root Surface-sterilized
## 717_6_R_S 717A + 141 Root Surface-sterilized
## 717_9_R_S 717A + 141 Root Surface-sterilized
## 717_10_N_N 717A + 141 Nodule Not surface-sterilized
## 717_5_N_N 717A + 141 Nodule Not surface-sterilized
## 717_6_N_N 717A + 141 Nodule Not surface-sterilized
## 717_9_N_N 717A + 141 Nodule Not surface-sterilized
## 717_10_N_S 717A + 141 Nodule Surface-sterilized
## 717_5_N_S 717A + 141 Nodule Surface-sterilized
## 717_6_N_S 717A + 141 Nodule Surface-sterilized
## 717_9_N_S 717A + 141 Nodule Surface-sterilized
## 733_10_R_N 733B + 141 Root Not surface-sterilized
## 733_4_R_N 733B + 141 Root Not surface-sterilized
## 733_8_R_N 733B + 141 Root Not surface-sterilized
## 733_9_R_N 733B + 141 Root Not surface-sterilized
## 733_10_R_S 733B + 141 Root Surface-sterilized
## 733_4_R_S 733B + 141 Root Surface-sterilized
## 733_8_N_N 733B + 141 Root Surface-sterilized
## 733_9_R_S 733B + 141 Root Surface-sterilized
## 733_10_N_N 733B + 141 Nodule Not surface-sterilized
## 733_4_N_N 733B + 141 Nodule Not surface-sterilized
## 733_8_R_S 733B + 141 Nodule Not surface-sterilized
## 733_9_N_N 733B + 141 Nodule Not surface-sterilized
## 733_10_N_S 733B + 141 Nodule Surface-sterilized
## 733_4_N_S 733B + 141 Nodule Surface-sterilized
## 733_8_N_S 733B + 141 Nodule Surface-sterilized
## 733_9_N_S 733B + 141 Nodule Surface-sterilized
mat_colors <- list(Inoculum = brewer.pal(7, "Dark2")[1:3], Tissue = brewer.pal(7, "Dark2")[4:5], Surface = brewer.pal(7, "Dark2")[6:7])
names(mat_colors$Inoculum) <- unique(mat_col$Inoculum)
names(mat_colors$Tissue) <- unique(mat_col$Tissue)
names(mat_colors$Surface) <- unique(mat_col$Surface)
Fig4 <- ComplexHeatmap::pheatmap(
mat = vsd_mat_fullmodel,
cluster_cols = TRUE,
cluster_rows = TRUE,
color = magma(8),
border_color = NA,
show_colnames = FALSE,
show_rownames = TRUE,
annotation_col = mat_col,
annotation_colors = mat_colors,
drop_levels = TRUE,
fontsize = 9,
heatmap_legend_param = list(title = "Abundance\n(variance-stabilized)"),
)
Fig4
# Open a PNG graphics device
png(filename = "./16S/Fig4.png", width = 9, height = 6.5, units = "in", res = 300)
# Draw the heatmap
plot(Fig4)
# Close the device
dev.off()
## quartz_off_screen
## 2